Home | History | Annotate | Download | only in X86
      1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file describes the X86 SSE instruction set, defining the instructions,
     11 // and properties of the instructions which are needed for code generation,
     12 // machine code emission, and analysis.
     13 //
     14 //===----------------------------------------------------------------------===//
     15 
// Pairs the reg-reg and reg-mem (folded load) itineraries for one operation,
// plus the scheduler write type used by the newer InstrSchedModel machinery.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;  // register-register form
  InstrItinClass rm = arg_rm;  // register-memory form
  // InstrSchedModel info. Defaults to WriteFAdd; overridden with 'let Sched'.
  X86FoldableSchedWrite Sched = WriteFAdd;
}
     22 
// Groups the single-precision (s) and double-precision (d) itineraries of an
// operation that exists in both FP sizes.
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;  // single precision
  OpndItins d = arg_d;  // double precision
}
     27 
     28 
// Like OpndItins, but with a third itinerary for the shift-by-immediate form.
// Note this class has no Sched field.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
  InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;  // register-register form
  InstrItinClass rm = arg_rm;  // register-memory form
  InstrItinClass ri = arg_ri;  // register-immediate form
}
     35 
     36 
// scalar
// Scalar FP ALU itineraries (single/double), scheduled as WriteFAdd, and the
// size-grouped record combining them.
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
}

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;
     51 
     52 let Sched = WriteFMul in {
     53 def SSE_MUL_F32S : OpndItins<
     54   IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
     55 >;
     56 
     57 def SSE_MUL_F64S : OpndItins<
     58   IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
     59 >;
     60 }
     61 
// Scalar FP multiply, selected by operand size.
def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;
     65 
     66 let Sched = WriteFDiv in {
     67 def SSE_DIV_F32S : OpndItins<
     68   IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
     69 >;
     70 
     71 def SSE_DIV_F64S : OpndItins<
     72   IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
     73 >;
     74 }
     75 
// Scalar FP divide, selected by operand size.
def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;
     79 
// parallel
// Packed FP ALU itineraries (single/double), scheduled as WriteFAdd, and the
// size-grouped record combining them.
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
}

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;
     94 
     95 let Sched = WriteFMul in {
     96 def SSE_MUL_F32P : OpndItins<
     97   IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
     98 >;
     99 
    100 def SSE_MUL_F64P : OpndItins<
    101   IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
    102 >;
    103 }
    104 
// Packed FP multiply, selected by operand size.
def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;
    108 
    109 let Sched = WriteFDiv in {
    110 def SSE_DIV_F32P : OpndItins<
    111   IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
    112 >;
    113 
    114 def SSE_DIV_F64P : OpndItins<
    115   IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
    116 >;
    117 }
    118 
// Packed FP divide, selected by operand size.
def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;
    122 
// Packed bitwise-logic itineraries, scheduled as a vector logic op.
let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

// NOTE(review): identical itineraries to the record above, but without a
// 'let Sched', so this one inherits the OpndItins default (WriteFAdd) --
// confirm that is intentional.
def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;
    131 
// Packed integer ALU itineraries (separate record for 64-bit-element ops).
let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
}

// Packed integer multiply.
let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

// Packed integer shifts; carries an extra itinerary for the immediate form.
// ShiftOpndItins has no Sched field, so no 'let Sched' applies here.
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
    150 
// Aligned (MOVA) and unaligned (MOVU) packed-move itineraries.
// NOTE(review): no 'let Sched' on these records, so they inherit the
// OpndItins default (WriteFAdd) -- confirm that is intentional.
def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;

// Double-precision dot product (DPPD).
def SSE_DPPD_ITINS : OpndItins<
  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;
    162 
    163 def SSE_DPPS_ITINS : OpndItins<
    164   IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
    165 >;
    166 
// Generic fallback itineraries for instructions without a dedicated pair.
def DEFAULT_ITINS : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

// EXTRACTPS itineraries.
def SSE_EXTRACT_ITINS : OpndItins<
  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
>;

// INSERTPS itineraries.
def SSE_INSERT_ITINS : OpndItins<
  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;
    178 
// Multiple sums of absolute differences (MPSADBW).
let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;

// Packed 32-bit integer multiply (PMULLD).
let Sched = WriteVecIMul in
def SSE_PMULLD_ITINS : OpndItins<
  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;
    188 
// Definitions for backward compatibility.
// The instructions mapped on these definitions use a different itinerary
// than the actual scheduling model: each record pairs the generic ALU
// itineraries (or the INTALU ones) with a more specific SchedModel write.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVarBlend in
def DEFAULT_ITINS_VARBLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteBlend in
def SSE_INTALU_ITINS_BLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
    236 
    237 //===----------------------------------------------------------------------===//
    238 // SSE 1 & 2 Instructions Classes
    239 //===----------------------------------------------------------------------===//
    240 
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
///
/// Instantiates the reg-reg (rr) and reg-mem (rm) forms of a two-operand
/// scalar FP operation matched via SDNode 'OpNode'.  'Is2Addr' selects the
/// two-operand SSE asm syntax instead of the three-operand AVX syntax.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  // Only the pure register form is commutable.
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
       Sched<[itins.Sched]>;
  }
  // Memory form: the second operand is a folded load.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
    260 
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
///
/// Like sse12_fp_scalar, but the patterns match the intrinsic whose name is
/// assembled as "int_x86_sse<SSEVer>_<OpcodeStr><FPSizeStr>".  Marked
/// isCodeGenOnly since these duplicate the non-intrinsic encodings.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             Domain d, OpndItins itins, bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr, d>,
       Sched<[itins.Sched]>;
  // Memory form: the second operand is matched through 'mem_cpat'.
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
    285 
/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
///
/// Instantiates the rr and rm forms of a two-operand packed FP operation of
/// vector type 'vt', loading the memory operand through 'mem_frag'.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  // Only the pure register form is commutable.
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
    307 
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
///
/// Variant where the caller supplies the selection patterns ('pat_rr' /
/// 'pat_rm') directly; used for the packed logical ops.  No itinerary is
/// attached (NoItinerary) -- scheduling comes from the Sched<> writes.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  // hasSideEffects = 0 because pat_rr may be empty for some instantiations.
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, NoItinerary, d>,
       Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
    327 
    328 //===----------------------------------------------------------------------===//
    329 //  Non-instruction patterns
    330 //===----------------------------------------------------------------------===//
    331 
// A vector extract of the first f32/f64 position is a subregister copy
// that needs no instruction.
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
    354 
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.  The high half of the
// destination is left undefined (IMPLICIT_DEF).
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}
    371 
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// NOTE(review): the 256-bit cases below also copy to VR128 -- presumably the
// upper lanes are undefined here; confirm intended.
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
    382 
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion.  One pattern per (dst, src)
// type pair.
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
// f128/i128 live in FR128 and likewise need no conversion instruction.
def : Pat<(f128  (bitconvert (i128  FR128:$src))), (f128  FR128:$src)>;
def : Pat<(i128  (bitconvert (f128  FR128:$src))), (i128  FR128:$src)>;
    417 
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion.  One pattern per (dst, src)
// type pair.
def : Pat<(v4i64  (bitconvert (v8i32  VR256:$src))), (v4i64  VR256:$src)>;
def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64  VR256:$src)>;
def : Pat<(v4i64  (bitconvert (v32i8  VR256:$src))), (v4i64  VR256:$src)>;
def : Pat<(v4i64  (bitconvert (v8f32  VR256:$src))), (v4i64  VR256:$src)>;
def : Pat<(v4i64  (bitconvert (v4f64  VR256:$src))), (v4i64  VR256:$src)>;
def : Pat<(v8i32  (bitconvert (v4i64  VR256:$src))), (v8i32  VR256:$src)>;
def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32  VR256:$src)>;
def : Pat<(v8i32  (bitconvert (v32i8  VR256:$src))), (v8i32  VR256:$src)>;
def : Pat<(v8i32  (bitconvert (v4f64  VR256:$src))), (v8i32  VR256:$src)>;
def : Pat<(v8i32  (bitconvert (v8f32  VR256:$src))), (v8i32  VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4i64  VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8i32  VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v32i8  VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4f64  VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8f32  VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v32i8  (bitconvert (v4i64  VR256:$src))), (v32i8  VR256:$src)>;
def : Pat<(v32i8  (bitconvert (v8i32  VR256:$src))), (v32i8  VR256:$src)>;
def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8  VR256:$src)>;
def : Pat<(v32i8  (bitconvert (v4f64  VR256:$src))), (v32i8  VR256:$src)>;
def : Pat<(v32i8  (bitconvert (v8f32  VR256:$src))), (v32i8  VR256:$src)>;
def : Pat<(v8f32  (bitconvert (v4i64  VR256:$src))), (v8f32  VR256:$src)>;
def : Pat<(v8f32  (bitconvert (v8i32  VR256:$src))), (v8f32  VR256:$src)>;
def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32  VR256:$src)>;
def : Pat<(v8f32  (bitconvert (v32i8  VR256:$src))), (v8f32  VR256:$src)>;
def : Pat<(v8f32  (bitconvert (v4f64  VR256:$src))), (v8f32  VR256:$src)>;
def : Pat<(v4f64  (bitconvert (v4i64  VR256:$src))), (v4f64  VR256:$src)>;
def : Pat<(v4f64  (bitconvert (v8i32  VR256:$src))), (v4f64  VR256:$src)>;
def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64  VR256:$src)>;
def : Pat<(v4f64  (bitconvert (v32i8  VR256:$src))), (v4f64  VR256:$src)>;
def : Pat<(v4f64  (bitconvert (v8f32  VR256:$src))), (v4f64  VR256:$src)>;
    450 
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  // Scalar FP +0.0 materialization pseudos (SSE1 for f32, SSE2 for f64).
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
    460 
    461 //===----------------------------------------------------------------------===//
    462 // AVX & SSE - Zero/One Vectors
    463 //===----------------------------------------------------------------------===//
    464 
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// The integer all-zeros vector reuses the same pseudo.
let Predicates = [NoVLX] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
    478 
    479 
// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
    489 
// All-ones materialization pseudos (128-bit, and 256-bit under AVX2).
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
    500 
    501 
    502 //===----------------------------------------------------------------------===//
    503 // SSE 1 & 2 - Move FP Scalar Instructions
    504 //
    505 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
    506 // register copies because it's a partial register update; Register-to-register
    507 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
    508 // that the insert be implementable in terms of a copy, and just mentioned, we
    509 // don't use movss/movsd for copies.
    510 //===----------------------------------------------------------------------===//
    511 
// sse12_move_rr - reg-reg forms of movss/movsd.
// 'rr' inserts the scalar RC:$src2 into the low element of VR128:$src1.
// 'rr_REV' is the store-direction encoding (opcode 0x11), emitted only for
// the disassembler.
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d = GenericDomain> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                 (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}
    529 
// sse12_move - full movss/movsd instantiation: AVX (V-prefixed, 3-operand)
// and SSE (2-operand, tied $src1 = $dst) reg-reg forms, plus the reg-to-mem
// store form for each encoding.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d = GenericDomain> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                  Sched<[WriteStore]>;
}
    553 
// Loading from memory automatically zeroing upper bits.
// Instantiates the AVX (V-prefixed) and SSE load forms of movss/movsd into
// a scalar register class, matched through 'mem_pat'.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr,
                         Domain d = GenericDomain> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
}
    567 
// Concrete movss/movsd definitions: XS = F3 prefix (movss), XD = F2 (movsd).
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble>, XD;

// Load forms; rematerializable and foldable as loads.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                               SSEPackedDouble>, XD;
}
    581 
    582 // Patterns
    583 let Predicates = [UseAVX] in {
    584   let AddedComplexity = 20 in {
    585   // MOVSSrm zeros the high parts of the register; represent this
    586   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
    587   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
    588             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
    589   def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
    590             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
    591   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
    592             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
    593 
    594   // MOVSDrm zeros the high parts of the register; represent this
    595   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
    596   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
    597             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
    598   def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
    599             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
    600   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
    601             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
    602   def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
    603             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
    604   def : Pat<(v2f64 (X86vzload addr:$src)),
    605             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
    606 
  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }

  // Extract and store.
  // Storing element 0 of a v4f32 is just a scalar VMOVSS store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;

  // Shuffle with VMOVSS
  // X86Movss selects the low element from $src2 and the rest from $src1;
  // the second operand must be in FR32 to match VMOVSSrr's operand class.
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants
  // Operate on the low 128-bit lanes and re-insert the result; NOTE(review):
  // SUBREG_TO_REG implies the upper lane of $src1 is not preserved here —
  // presumably the DAG only forms these when that is acceptable; confirm
  // against the lowering that produces 256-bit X86Movss.
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  // Same merge-low-element idea as VMOVSS above, for 64-bit elements.
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
    678 }
    679 
// Non-AVX (MOVSS) counterparts of the VMOVSS patterns above.
let Predicates = [UseSSE1] in {
  // Without SSE4.1 (no BLENDPS/insertps), zero-extension of a scalar into a
  // vector is done with an explicit zero vector + MOVSS merge.
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  // Storing element 0 of a v4f32 is just a scalar MOVSS store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  // Merge the low element of $src2 into $src1.
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
    713 
// Non-AVX (MOVSD) counterparts of the VMOVSD patterns above.
let Predicates = [UseSSE2] in {
  // Without SSE4.1, zero-extension of a scalar double uses an explicit zero
  // vector + MOVSD merge.
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Shuffle with MOVSD
  // Merge the low 64-bit element of $src2 into $src1.
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
    759 
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
// VR128L/VR128H constrain which operand gets a low vs. high register number so
// the alias only matches when the swapped (_REV) form yields the shorter
// encoding. The trailing 0 disables emitting the alias in disassembly output.
def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
    766 
    767 //===----------------------------------------------------------------------===//
    768 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
    769 //===----------------------------------------------------------------------===//
    770 
// Emits the register-to-register ("rr") and memory-load ("rm") forms of a
// packed FP move (movaps/movapd/movups/movupd and their VEX variants).
//   opc               - instruction opcode byte
//   RC                - destination/source register class (VR128 or VR256)
//   x86memop          - memory operand type (f128mem or f256mem)
//   ld_frag           - load pattern fragment matched by the rm form
//   asm               - mnemonic string
//   d                 - execution domain (SSEPackedSingle/SSEPackedDouble)
//   itins             - itinerary pair for the rr/rm forms
//   IsReMaterializable - whether the load form may be rematerialized
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
// The rr form has no ISel pattern (moves are inserted by copy lowering), so
// mark it side-effect free for the scheduler.
let hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
           Sched<[WriteFShuffle]>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
           Sched<[WriteLoad]>;
}
    786 
// VEX-encoded packed moves. NoVLX: when AVX512VL is available the EVEX forms
// defined elsewhere take precedence, so these are predicated out.
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX;
// Trailing 0 = not rematerializable (unaligned double-precision loads).
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX;

// 256-bit (VEX.L) variants.
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX, VEX_L;
}

// Legacy SSE encodings.
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD;
}
    831 
// VEX-encoded packed-FP store forms (opcode 0x29 aligned, 0x11 unaligned),
// with the matching (aligned)store ISel patterns.
let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]  in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
// 256-bit variants; aligned stores require 32-byte alignment (alignedstore256).
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW
    866 
// For disassembler
// Reverse-encoded (MRMDestReg) register-to-register forms. These exist only
// so the disassembler can decode the store-opcode encodings (0x29/0x11) when
// used reg-to-reg; they carry no patterns and are never selected.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
    903 
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
// The low/high register-class pairs (VR128L/VR128H, VR256L/VR256H) restrict
// the alias to the operand combinations where the _REV form actually saves a
// byte; the trailing 0 keeps the alias out of disassembly output.
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
    922 
// Legacy SSE packed-FP store forms and their ISel patterns.
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW

// For disassembler
// Legacy-encoded reverse (MRMDestReg) reg-to-reg forms; decode-only, never
// selected (no patterns).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
    958 
// Use vmovaps/vmovups for AVX integer load/store.
// The FP forms are one byte shorter than vmovdqa/vmovdqu; the domain-fixup
// pass later rewrites them if the integer domain is preferable.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  // NOTE(review): the v2i64 extract-stores below use the PD (VMOVAPDmr/
  // VMOVUPDmr) forms while all other integer types use PS — looks like an
  // inconsistency, though the two are functionally equivalent stores.
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
   1030 
// Byte/word vector stores via the FP move forms. Guarded separately from the
// block above: with AVX512VL+BWI the EVEX vmovdqu8/16 forms handle these.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  // 128-bit load/store
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
   1052 
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  // Aligned stores of every 128-bit integer type go through MOVAPS.
  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  // Unaligned stores use MOVUPS.
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
   1079 
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
// These let a scalar FP value be rematerialized from an aligned 128-bit
// constant-pool slot with a full-width load.
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}
   1102 
   1103 //===----------------------------------------------------------------------===//
   1104 // SSE 1 & 2 - Move Low packed FP Instructions
   1105 //===----------------------------------------------------------------------===//
   1106 
// Emits the single-precision ("PSrm") and double-precision ("PDrm") memory
// forms of a low/high packed move (movlps/movlpd, movhps/movhpd).
//   opc      - shared opcode byte
//   psnode   - DAG node matched for the PS form (e.g. X86Movlps)
//   pdnode   - DAG node matched for the PD form (e.g. X86Movlpd)
//   base_opc - mnemonic stem ("movlp"/"movhp"); "s"/"d" is appended per form
//   asm_opr  - operand assembly string (2-operand SSE or 3-operand VEX shape)
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  // Both forms load 64 bits from memory; the PS form views them as two f32s
  // via the bc_v4f32 bitcast.
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, PS,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, PD,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

}
   1128 
// Wraps sse12_mov_hilo_packed_base to produce both encodings:
// the 3-operand VEX form (V-prefixed, [UseAVX]) and the 2-operand legacy SSE
// form (tied $src1 = $dst).
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}

// MOVLPS/MOVLPD (+ VEX variants): merge a 64-bit memory operand into the low
// half of an xmm register.
let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
   1146 
// Store forms: movlps/movlpd write the low 64 bits of an xmm register to
// memory, matched as an extract of element 0 as f64.
let SchedRW = [WriteStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
} // SchedRW
   1171 
// Extra ISel patterns folding loads/stores into the VEX movlp forms.
let Predicates = [UseAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  // A movsd merging a loaded scalar into the low half is the same operation.
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  // load-shuffle-store of the low half collapses to a single 64-bit store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
   1202 
   1203 let Predicates = [UseSSE1] in {
   1204   // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
   1205   def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
   1206                                  (iPTR 0))), addr:$src1),
   1207             (MOVLPSmr addr:$src1, VR128:$src2)>;
   1208 
   1209   // Shuffle with MOVLPS
   1210   def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
   1211             (MOVLPSrm VR128:$src1, addr:$src2)>;
   1212   def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
   1213             (MOVLPSrm VR128:$src1, addr:$src2)>;
   1214   def : Pat<(X86Movlps VR128:$src1,
   1215                       (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
   1216             (MOVLPSrm VR128:$src1, addr:$src2)>;
   1217 
   1218   // Store patterns
   1219   def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
   1220                                       addr:$src1),
   1221             (MOVLPSmr addr:$src1, VR128:$src2)>;
   1222   def : Pat<(store (v4i32 (X86Movlps
   1223                    (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
   1224                               addr:$src1),
   1225             (MOVLPSmr addr:$src1, VR128:$src2)>;
   1226 }
   1227 
   1228 let Predicates = [UseSSE2] in {
   1229   // Shuffle with MOVLPD
   1230   def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
   1231             (MOVLPDrm VR128:$src1, addr:$src2)>;
   1232   def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
   1233             (MOVLPDrm VR128:$src1, addr:$src2)>;
   1234   def : Pat<(v2f64 (X86Movsd VR128:$src1,
   1235                              (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
   1236             (MOVLPDrm VR128:$src1, addr:$src2)>;
   1237 
   1238   // Store patterns
   1239   def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
   1240                            addr:$src1),
   1241             (MOVLPDmr addr:$src1, VR128:$src2)>;
   1242   def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
   1243                            addr:$src1),
   1244             (MOVLPDmr addr:$src1, VR128:$src2)>;
   1245 }
   1246 
   1247 //===----------------------------------------------------------------------===//
   1248 // SSE 1 & 2 - Move Hi packed FP Instructions
   1249 //===----------------------------------------------------------------------===//
   1250 
// MOVHPS/MOVHPD load-merge forms (opcode 0x16) come from the shared
// sse12_mov_hilo_packed multiclass; AddedComplexity = 20 prefers them over
// more generic shuffle patterns of the same size.
let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}

// Store forms (opcode 0x17): store the HIGH 64 bits of the XMM register.
// The pattern models this as "unpack high-to-low, then extract element 0".
let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW
   1284 
// Selection patterns for the MOVHP load-merge/store instructions, split by
// subtarget predicate (AVX / SSE1 / SSE2).
let Predicates = [UseAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // VMOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // Storing lane 1 via a vpermilpd-by-1 + extract-element-0 is a plain
  // movhpd store.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // Storing lane 1 via shufpd-by-1 + extract-element-0 is a movhpd store.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}
   1346 
   1347 //===----------------------------------------------------------------------===//
   1348 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
   1349 //===----------------------------------------------------------------------===//
   1350 
// Register-to-register MOVLHPS (0x16) and MOVHLPS (0x12). The VEX forms are
// three-operand (VEX_4V); the legacy forms below are two-operand and tie
// $src1 to $dst via the Constraints string, which is why their asm strings
// only print $src2 and $dst.
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}
   1381 
// Extra patterns selecting the integer-typed forms of the movlhps/movhlps
// shuffle nodes onto the same FP instructions (the instruction patterns
// above only cover v4f32).
let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
   1405 
   1406 //===----------------------------------------------------------------------===//
   1407 // SSE 1 & 2 - Conversion Instructions
   1408 //===----------------------------------------------------------------------===//
   1409 
// Itinerary bundles (rr/rm InstrItinClass pairs plus a scheduling class)
// for the conversion instructions below.
// NOTE(review): unlike its siblings, SSE_CVT_PD carries no `let Sched =`
// override, so it inherits the OpndItins class default (WriteFAdd) rather
// than a conversion write class — confirm that is intended.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
   1438 
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false depenendecies (see sse_fp_unop_s for details)
// Scalar conversion with a DAG pattern: emits an rr form and a load-folding
// rm form, both driven by OpNode.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>, Sched<[itins.Sched.Folded]>;
}

// Packed conversion: pattern-less (hasSideEffects = 0) rr/rm pair; matching
// is done elsewhere, these defs only provide the encodings.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}

// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false depenendecies (see sse_fp_unop_s for details)
// AVX three-operand scalar conversion (destination register doubles as the
// pass-through $src1); pattern-less, selected via the Pat<>s further down.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // hasSideEffects = 0
}
   1479 
// AVX (VEX-encoded) truncating float->int conversions, plus explicit
// {l}/{q}-suffixed assembler aliases to disambiguate the memory forms.
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;

let Predicates = [UseAVX] in {
  // NOTE(review): the alias operands use FR64 even for the SS instruction
  // (whose defs above use FR32) — looks inconsistent; confirm the intended
  // register class for the alias result operands.
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;

  // sint_to_fp selection: the pass-through operand is undefined, so feed an
  // IMPLICIT_DEF as $src1.
  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
   1552 
// Legacy-SSE encodings of the same scalar conversions (truncating
// float->int and int->float), plus their {l}/{q} assembler aliases.
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD, REX_W;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;

// NOTE(review): FR64:$dst here for cvtsi2ss (an SS instruction; its defm
// above uses FR32) mirrors the AVX aliases — confirm intended class.
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
   1599 
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false depenendecies (see sse_fp_unop_s for details)
// Intrinsic-based conversion: rr form takes the whole source register,
// rm form matches via a ComplexPattern (mem_cpat) for scalar loads.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                         string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
           Sched<[itins.Sched.Folded]>;
}

// Three-address intrinsic conversion: $src1 is merged into the result.
// Is2Addr selects between the legacy two-operand asm string and the AVX
// three-operand one.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   1636 
// cvtsd2si (rounding, non-truncating) via the SSE2 intrinsics, AVX and
// legacy encodings.
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;


// Intrinsic (XMM-operand) forms of cvtsi2ss/cvtsi2sd. isCodeGenOnly keeps
// these out of the assembler tables since they share encodings with the
// register-class forms above.
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V;
  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V,
            VEX_W;
  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
            SSE_CVT_Scalar, 0>, XD, VEX_4V;
  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
            SSE_CVT_Scalar, 0>, XD,
            VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
  }
} // isCodeGenOnly = 1
   1683 
/// SSE 1 Only

// Aliases for intrinsics
// Intrinsic (XMM-operand) forms of the truncating conversions; like the
// Int_*CVTSI2* defs above these are isCodeGenOnly because they alias the
// register-class encodings.
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>,
                                   XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>,
                                  XD, VEX, VEX_W;
}
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
} // isCodeGenOnly = 1
   1717 
// cvtss2si (rounding, non-truncating) via the SSE1 intrinsics, plus
// cvtdq2ps packed conversions and the {l}/{q} assembler aliases for the
// scalar forms.
let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;

defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            PS, Requires<[UseSSE2]>;

let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}

def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
   1782 
/// SSE 2 Only

// Convert scalar double to scalar single
// The VEX rr form carries no pattern; plain f64->f32 rounding is selected
// through the Pat<> below so that both VEX sources can be supplied.
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2F]>;
// Load-folding form; only enabled when optimizing for size (see Requires).
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [], IIC_SSE_CVT_Scalar_RM>,
                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

// fround f64 -> f32 under AVX: tie both VEX sources to the same register.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;

// Legacy-encoded scalar forms with direct selection patterns.
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
// Memory form folds the load, but only when optimizing for size.
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                      IIC_SSE_CVT_Scalar_RM>,
                      XD,
                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
   1814 
   1815 let isCodeGenOnly = 1 in {
   1816 def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
   1817                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
   1818                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   1819                        [(set VR128:$dst,
   1820                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
   1821                        IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
   1822                        Sched<[WriteCvtF2F]>;
   1823 def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
   1824                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
   1825                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   1826                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
   1827                                           VR128:$src1, sse_load_f64:$src2))],
   1828                        IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
   1829                        Sched<[WriteCvtF2FLd, ReadAfterLd]>;
   1830 
   1831 let Constraints = "$src1 = $dst" in {
   1832 def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
   1833                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
   1834                        "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
   1835                        [(set VR128:$dst,
   1836                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
   1837                        IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
   1838                        Sched<[WriteCvtF2F]>;
   1839 def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
   1840                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
   1841                        "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
   1842                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
   1843                                           VR128:$src1, sse_load_f64:$src2))],
   1844                        IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
   1845                        Sched<[WriteCvtF2FLd, ReadAfterLd]>;
   1846 }
   1847 } // isCodeGenOnly = 1
   1848 
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
// The VEX rr/rm defs carry no patterns; selection is via the Pat<>s below.
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

// fextend f32 -> f64 under AVX: both VEX sources tie to the same register;
// the load form uses IMPLICIT_DEF for the untouched first source.
def : Pat<(f64 (fextend FR32:$src)),
    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

// extload: fold the load into the conversion only when optimizing for
// size; when optimizing for speed, load separately through VMOVSSrm.
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;

// Legacy-encoded scalar forms with direct selection patterns.
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;

// extload f32 -> f64.  This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
   1899 
// Intrinsic (whole-XMM) forms of cvtss2sd; VR128 operands model the
// intrinsic's pass-through of the upper destination elements.
let isCodeGenOnly = 1 in {
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
                    Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
   1932 
// Convert packed single/double fp to doubleword
// cvtps2dq: packed single -> packed signed dword (intrinsic selection only
// here; rounding behavior follows the intrinsic's definition).
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
// 256-bit VEX forms.
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
// Legacy encodings; note the mem form uses memop (alignment-checked load).
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
   1962 
   1963 
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                       VEX, Sched<[WriteCvtF2I]>;

// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
                       Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
                       Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
}

// Legacy (non-VEX) forms.
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
   2007 
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
// These defs carry no patterns; selection is via the fp_to_sint Pat<>s
// further down.
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                          Sched<[WriteCvtF2ILd]>;

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
   2030 
// Selection patterns mapping the cvtdq2ps intrinsic to the VEX forms.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;
}

// Generic sint_to_fp / fp_to_sint selection for the VEX forms; NoVLX so
// the AVX-512VL encodings can take precedence when available.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

// Legacy-encoded equivalents (memop loads require alignment).
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
   2076 
// cvttpd2dq: packed double -> packed signed dword with truncation.
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))],
                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

// Select the YMM forms for v4f64 fp_to_sint (result narrows to v4i32).
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX]

// Legacy (non-VEX) forms.
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memopv2f64 addr:$src)))],
                                        IIC_SSE_CVT_PD_RM>,
                      Sched<[WriteCvtF2ILd]>;
   2123 
// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
// rr forms have no patterns; selected via the X86vfpext/fextend Pat<>s
// further down.
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [], IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [], IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [], IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}

// Legacy (non-VEX) forms.
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [], IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}
   2151 
// Convert Packed DW Integers to Packed Double FP
// Empty-pattern defs; selected through the X86cvtdq2pd / sint_to_fp
// Pat<>s below.  The 128-bit form reads only the low 64 bits of memory.
let Predicates = [HasAVX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        []>, VEX, Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         []>, VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         []>, VEX, VEX_L, Sched<[WriteCvtI2F]>;
}
   2168 
   2169 let hasSideEffects = 0, mayLoad = 1 in
   2170 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   2171                        "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
   2172                        IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
   2173 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
   2174                        "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
   2175                        IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
   2176 
// AVX register conversion intrinsics
let Predicates = [HasAVX] in {
  def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
            (VCVTDQ2PDrr VR128:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDrm addr:$src)>;
  // Also match when the load has already been narrowed to 64 bits and
  // widened via scalar_to_vector.
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTDQ2PDrm addr:$src)>;

  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]

// SSE2 register conversion intrinsics
let Predicates = [HasSSE2] in {
  def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
            (CVTDQ2PDrr VR128:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
            (CVTDQ2PDrm addr:$src)>;
  def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [HasSSE2]
   2201 
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;

// Legacy (non-VEX) forms.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
   2243 
   2244 
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
            (VCVTDQ2PSYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  // Match fround and fextend for 128/256-bit conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (VCVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
            (VCVTPD2PSXrm addr:$src)>;
  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
            (VCVTPD2PSYrr VR256:$src)>;
  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
            (VCVTPD2PSYrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (VCVTPS2PDrr VR128:$src)>;
  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
            (VCVTPS2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
            (VCVTPS2PDYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // Match fround and fextend for 128 conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
            (CVTPD2PSrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (CVTPS2PDrr VR128:$src)>;
}
   2284 
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//   RC       - operand register class (FR32/FR64)
//   x86memop - memory operand for the rm form
//   CC       - condition-code operand kind used in the pretty-printed asm
//   OpNode   - DAG node to select (carries the comparison predicate)
//   ld_frag  - load fragment folded in the rm form
//   asm/asm_alt - mnemonic with named ${cc} vs. explicit $cc immediate
//   immLeaf  - immediate predicate bounding the accepted condition codes
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins, ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
                itins.rr>, Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), immLeaf:$cc))],
                                         itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  // Assembler/disassembler only; no patterns.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RM>,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
   2317 
// AVX scalar compares: three-operand VEX forms (VEX_4V keeps $src1 separate
// from $dst) accepting the full 5-bit predicate range (i8immZExt5 / AVXCC).
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

// Legacy SSE scalar compares: two-operand forms with $src1 tied to $dst and
// only the 3-bit predicate range (i8immZExt3 / SSECC).
let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
                  i8immZExt3>, XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F64S, i8immZExt3>, XD;
}
   2338 
// sse12_cmp_scalar_int - scalar compare forms that operate on whole XMM
// registers (VR128) and match the cmp_ss/cmp_sd intrinsics directly, instead
// of the FR32/FR64 scalar node matched by sse12_cmp_scalar.  Same 0xC2
// opcode; rr and rm variants only (no _alt forms).
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                         Intrinsic Int, string asm, OpndItins itins,
                         ImmLeaf immLeaf> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, immLeaf:$cc))],
                                               itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), immLeaf:$cc))],
                                               itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   2355 
// Codegen-only instantiations (never asm-parsed/printed as distinct
// instructions) used solely to select the XMM-operand compare intrinsics.
let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>,
                       XS, VEX_4V;
  defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S, i8immZExt5>, // same latency as f32
                       XD, VEX_4V;
  // Legacy SSE forms: destructive two-operand encoding, 3-bit predicates.
  let Constraints = "$src1 = $dst" in {
    defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F32S, i8immZExt3>, XS;
    defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F64S, i8immZExt3>,
                         XD;
}
}
   2376 
   2377 
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Shared skeleton for (V)UCOMISS/SD and (V)COMISS/SD: no register result
// (empty outs) — the only architectural effect modeled is writing EFLAGS,
// which the instantiations below place under "let Defs = [EFLAGS]".
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr> {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
                     IIC_SSE_COMIS_RR>,
          Sched<[WriteFAdd]>;
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))],
                                           IIC_SSE_COMIS_RM>,
          Sched<[WriteFAddLd, ReadAfterLd]>;
}
   2394 
// (V)UCOMIS* / (V)COMIS* — scalar FP compares whose only modeled output is
// EFLAGS.  Layout of the instantiations:
//   * FR32/FR64 forms select the X86cmp node (UCOMIS only; the COMIS
//     variants pass `undef` and clear their patterns via "Pattern = []<dag>",
//     so they exist for the assembler/intrinsics rather than generic ISel).
//   * The isCodeGenOnly "Int_" forms operate on whole VR128 registers and
//     select the X86ucomi / X86comi nodes for the intrinsic lowering.
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD, VEX, VEX_LIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss">, PS, VEX, VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd">, PD, VEX, VEX_LIG;
  }

  let isCodeGenOnly = 1 in {
    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss">, PS, VEX;
    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd">, PD, VEX;

    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                              load, "comiss">, PS, VEX;
    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                              load, "comisd">, PD, VEX;
  }
  // Legacy (non-VEX) encodings of the same instructions.
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss">, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd">, PD;
  }

  let isCodeGenOnly = 1 in {
    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                                load, "ucomiss">, PS;
    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                                load, "ucomisd">, PD;

    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                    "comisd">, PD;
  }
} // Defs = [EFLAGS]
   2442 
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Register-register (rri) and register-memory (rmi) forms of the packed
// immediate-predicate compare (opcode 0xC2), matching the cmp_ps/cmp_pd
// intrinsics directly.  Like the scalar multiclass above, the asm-parser-only
// "_alt" forms accept the predicate as an explicit u8imm; here they do use
// itins.rr/itins.rm.
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d, ImmLeaf immLeaf,
                            PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
             itins.rr, d>,
            Sched<[WriteFAdd]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
             itins.rm, d>,
            Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    let mayLoad = 1 in
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}
   2472 
// AVX packed compares: 128-bit and 256-bit (VEX_L) three-operand forms with
// the full 5-bit predicate range.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
// Legacy SSE packed compares: destructive two-operand forms.  Note these are
// instantiated with i8immZExt5, not i8immZExt3 as the scalar legacy forms
// are, and with the alignment-checking memop fragments.
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}
   2499 
// Selection patterns mapping the generic X86cmpp DAG node (with an arbitrary
// predicate immediate) onto the intrinsic-form CMPP* instructions defined
// above, per target feature level.
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

// Non-AVX fallbacks; memory operands go through the alignment-checking
// memop fragments.
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
   2533 
   2534 //===----------------------------------------------------------------------===//
   2535 // SSE 1 & 2 - Shuffle Instructions
   2536 //===----------------------------------------------------------------------===//
   2537 
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// SHUFPS/SHUFPD skeleton (opcode 0xC6): two sources plus an 8-bit shuffle
/// control immediate ($src3), selected from the X86Shufp node.  The memory
/// form (rmi) is listed first; the register form (rri) second.
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteFShuffleLd, ReadAfterLd]>;
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                     (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteFShuffle]>;
}
   2553 
// VEX-encoded shuffles, gated on NoVLX so the AVX-512VL forms take priority
// when available.
let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
}
// Legacy SSE shuffles: destructive two-operand encoding.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble>, PD;
}
   2576 
// Integer-typed X86Shufp nodes reuse the FP shuffle instructions; memory
// operands of v4i32/v8i32 patterns arrive as bitcast i64-vector loads.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

// Non-AVX equivalents (aligned memop fragments).
let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
   2620 
   2621 //===----------------------------------------------------------------------===//
   2622 // SSE 1 & 2 - Unpack FP Instructions
   2623 //===----------------------------------------------------------------------===//
   2624 
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// UNPCK[LH]PS/PD skeleton: register (rr) and memory (rm) forms selected
/// from the X86Unpckl/X86Unpckh nodes passed in as OpNode.
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))],
                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))],
                                       IIC_SSE_UNPCK, d>,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
   2643 
// VEX-encoded unpacks (0x15 = unpckh, 0x14 = unpckl), 128- and 256-bit,
// gated on NoVLX so AVX-512VL forms win when present.
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
}// Predicates = [HasAVX, NoVLX]
// Legacy SSE unpacks: destructive two-operand encoding, aligned memops.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
   2685 
// With AVX1 but no AVX2 there are no 256-bit integer unpack instructions,
// so map the 256-bit integer unpack nodes onto the FP VUNPCK*Y forms.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
   2705 
   2706 //===----------------------------------------------------------------------===//
   2707 // SSE 1 & 2 - Extract Floating-Point Sign mask
   2708 //===----------------------------------------------------------------------===//
   2709 
/// sse12_extr_sign_mask - sse 1 & 2 sign-mask extraction (MOVMSKPS/PD):
/// copies the sign bits of each packed FP element into a GR32/GR64
/// destination, selected from the X86movmsk node.  Register-only form.
/// (Header previously said "unpack and interleave" — copy-paste from the
/// multiclass above.)
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
              Sched<[WriteVecLogic]>;
}
   2718 
// VEX-encoded sign-mask extractions, including the 256-bit (VEX_L) forms.
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L;
}

// Legacy (non-VEX) encodings.
defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;
   2734 
   2735 //===---------------------------------------------------------------------===//
   2736 // SSE2 - Packed Integer Logical Instructions
   2737 //===---------------------------------------------------------------------===//
   2738 
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// rr and rm forms of a packed-integer two-operand instruction.  Is2Addr
/// switches the asm string between the destructive SSE syntax
/// ("op $src2, $dst") and the three-operand AVX syntax; the memory operand
/// is bitconverted from memop_frag's type to OpVT before the node match.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, OpndItins itins,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))],
                                     itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
   2765 
// PDI_binop_all - Instantiates a packed-integer binary op at all three
// encoding levels: the VEX 128-bit form ("v" prefix, non-destructive), the
// legacy SSE 128-bit form ($src1 tied to $dst), and the AVX2 256-bit form
// ("v" prefix + "Y" suffix).  prd is ANDed with HasAVX/HasAVX2 to let
// AVX-512VL variants take priority (NoVLX in the users below).
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         OpndItins itins, bit IsCommutable = 0, Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, itins,
                               IsCommutable, 0>, VEX_4V, VEX_L;
}

// These are ordered here for pattern ordering requirements with the fp versions

// Packed bitwise logic on v2i64/v4i64.  PANDN is not commutable because the
// first operand is complemented (X86andnp).
defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
   2793 
   2794 //===----------------------------------------------------------------------===//
   2795 // SSE 1 & 2 - Logical Instructions
   2796 //===----------------------------------------------------------------------===//
   2797 
// Multiclass for scalars using the X86 logical operation aliases for FP.
//
// Instantiates the packed ANDPS/ANDPD-style logic ops on the scalar register
// classes FR32/FR64 (via the *_128 load fragments) so that X86fand etc. on
// scalars can be selected.  AVX forms first, then the tied-operand SSE forms.
multiclass sse12_fp_packed_scalar_logical_alias<
    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
                PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
                PD, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
  }
}

// Scalar FP logical-alias instructions; codegen-only because they share
// encodings with the packed forms.  ANDN is not commutable (first operand
// is complemented).
let isCodeGenOnly = 1 in {
  defm FsAND  : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FsOR   : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FsXOR  : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  let isCommutable = 0 in
    defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
   2830 
// Multiclass for vectors using the X86 logical operation aliases for FP.
// Defines the VEX-encoded 128-bit (V*PS/V*PD) and 256-bit (V*PSY/V*PDY)
// variants under [HasAVX, NoVLX_Or_NoDQI], plus the legacy SSE 128-bit
// variants, which tie $src1 to $dst since the non-VEX encodings are
// destructive.
multiclass sse12_fp_packed_vector_logical_alias<
    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
  let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
  // 128-bit AVX single- and double-precision forms.
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
              PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V;

  // 256-bit AVX forms.
  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
        VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
        PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V, VEX_L;
  }

  // Legacy SSE forms: two-operand/destructive, and use alignment-checked
  // memop loads rather than the AVX unaligned loads.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
                PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
                PD;
  }
}
   2862 
// Code-gen-only vector aliases of the packed FP logical ops. As with the
// scalar aliases above, ANDN is non-commutable because X86fandn complements
// its first operand.
let isCodeGenOnly = 1 in {
  defm FvAND  : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FvOR   : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FvXOR  : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  let isCommutable = 0 in
    defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
   2875 
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// All patterns here select on integer-bitcast (bc_v2i64/bc_v4i64) operands;
/// the register-register and register-memory pattern lists are passed
/// separately to sse12_fp_packed_logical_rm so the memory forms fold the
/// load.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem,
        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem,
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (bc_v4i64 (v4f64 VR256:$src2))))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (loadv4i64 addr:$src2)))], 0>,
                                  PD, VEX_4V, VEX_L;

  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>,
                                                 PD, VEX_4V;
  }

  // Legacy SSE forms: two-operand/destructive, with alignment-checked
  // memop loads.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PD;
  }
}
   2928 
// Instantiate the SSE 1 & 2 packed FP logical instructions. ANDN is not
// commutable because ANDNPS/ANDNPD (X86andnp) complement the first operand.
defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
   2934 
// AVX1 requires type coercions in order to fold loads directly into logical
// operations. Each pattern matches a v4i64 logical node whose result is
// bitcast to v8f32 and selects the corresponding 256-bit PS memory form.
let Predicates = [HasAVX1Only] in {
  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
   2947 
   2948 //===----------------------------------------------------------------------===//
   2949 // SSE 1 & 2 - Arithmetic Instructions
   2950 //===----------------------------------------------------------------------===//
   2951 
   2952 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
   2953 /// vector forms.
   2954 ///
   2955 /// In addition, we also have a special variant of the scalar form here to
   2956 /// represent the associated intrinsic operation.  This form is unlike the
   2957 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
   2958 /// and leaves the top elements unmodified (therefore these cannot be commuted).
   2959 ///
   2960 /// These three forms can each be reg+reg or reg+mem.
   2961 ///
   2962 
   2963 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
   2964 /// classes below
// Packed (PS/PD, 128- and 256-bit) forms of the basic SSE 1 & 2 FP binops.
// The AVX variants are three-operand VEX encodings guarded by
// [HasAVX, NoVLX]; the legacy SSE variants tie $src1 to $dst.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, SizeItins itins> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
  }

  // Legacy SSE forms: destructive encodings with alignment-checked loads.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              itins.s>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              itins.d>, PD;
  }
}
   2992 
// Scalar (SS/SD) forms of the basic SSE 1 & 2 FP binops, operating on
// FR32/FR64. AVX variants are three-operand; legacy SSE variants tie
// $src1 to $dst.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
                         XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
                         XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              itins.d>, XD;
  }
}
   3011 
// Intrinsic (whole-XMM) forms of the scalar binops. These operate on VR128
// and leave the upper elements untouched, so they cannot be commuted; the
// "_ss"/"_sd" strings select the matching intrinsic name.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   SSEPackedSingle, itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   SSEPackedDouble, itins.d>, XD;
  }
}
   3030 
// Binary Arithmetic instructions
// ADD and MUL are commutable. SUB/DIV are inherently order-dependent, and
// MAX/MIN are kept non-commutable because the x86 max/min instructions'
// results depend on operand order for NaN and signed-zero inputs.
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}
   3052 
// Code-gen-only max/min variants selected from the X86fmaxc/X86fminc nodes
// (the commutable forms of max/min); they reuse the same opcodes and
// mnemonics as MAX/MIN, hence isCodeGenOnly.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
   3059 
   3060 // Patterns used to select SSE scalar fp arithmetic instructions from
   3061 // either:
   3062 //
   3063 // (1) a scalar fp operation followed by a blend
   3064 //
   3065 // The effect is that the backend no longer emits unnecessary vector
   3066 // insert instructions immediately after SSE scalar fp instructions
   3067 // like addss or mulss.
   3068 //
   3069 // For example, given the following code:
   3070 //   __m128 foo(__m128 A, __m128 B) {
   3071 //     A[0] += B[0];
   3072 //     return A;
   3073 //   }
   3074 //
   3075 // Previously we generated:
   3076 //   addss %xmm0, %xmm1
   3077 //   movss %xmm1, %xmm0
   3078 //
   3079 // We now generate:
   3080 //   addss %xmm1, %xmm0
   3081 //
   3082 // (2) a vector packed single/double fp operation followed by a vector insert
   3083 //
   3084 // The effect is that the backend converts the packed fp instruction
   3085 // followed by a vector insert into a single SSE scalar fp instruction.
   3086 //
   3087 // For example, given the following code:
   3088 //   __m128 foo(__m128 A, __m128 B) {
   3089 //     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
   3091 //   }
   3092 //
   3093 // Previously we generated:
   3094 //   addps %xmm0, %xmm1
   3095 //   movss %xmm1, %xmm0
   3096 //
   3097 // We now generate:
   3098 //   addss %xmm1, %xmm0
   3099 
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
// Selects a scalar SS instruction (e.g. ADDSSrr_Int) for a scalar f32 op
// whose result is merged into lane 0 of a vector via movss or blendi.
multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE1] in {
    // extracted scalar math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movsd, so match that too.
  let Predicates = [UseSSE41] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (i8 1))),
      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
      (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;

  }

  // Repeat everything for AVX, except for the movss + scalar combo...
  // because that one shouldn't occur with AVX codegen?
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
          FR32:$src))), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
          (COPY_TO_REGCLASS FR32:$src, VR128))>;

    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;

    // vector math op with insert via blend
    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
  }
}
   3154 
// Instantiate the f32 scalar-math patterns for the four basic binops.
defm : scalar_math_f32_patterns<fadd, "ADD">;
defm : scalar_math_f32_patterns<fsub, "SUB">;
defm : scalar_math_f32_patterns<fmul, "MUL">;
defm : scalar_math_f32_patterns<fdiv, "DIV">;
   3159 
// f64 counterpart of scalar_math_f32_patterns: selects a scalar SD
// instruction for a scalar f64 op merged into lane 0 via movsd or blendi.
multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
  let Predicates = [UseSSE2] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movsd, so match those too.
  let Predicates = [UseSSE41] in {
    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))), (i8 1))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }

  // Repeat everything for AVX.
  let Predicates = [HasAVX] in {
    // extracted scalar math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // extracted scalar math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
          FR64:$src))), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
          (COPY_TO_REGCLASS FR64:$src, VR128))>;

    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;

    // vector math op with insert via blend
    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
  }
}
   3217 
// Instantiate the f64 scalar-math patterns for the four basic binops.
defm : scalar_math_f64_patterns<fadd, "ADD">;
defm : scalar_math_f64_patterns<fsub, "SUB">;
defm : scalar_math_f64_patterns<fmul, "MUL">;
defm : scalar_math_f64_patterns<fdiv, "DIV">;
   3222 
   3223 
   3224 /// Unop Arithmetic
   3225 /// In addition, we also have a special variant of the scalar form here to
   3226 /// represent the associated intrinsic operation.  This form is unlike the
   3227 /// plain scalar form, in that it takes an entire vector (instead of a
   3228 /// scalar) and leaves the top elements undefined.
   3229 ///
   3230 /// And, we have a special variant form for a full-vector intrinsic form.
   3231 
// Itineraries for the scalar and packed square-root forms; all use the
// WriteFSqrt scheduling class.
let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}
   3249 
// Itineraries for the reciprocal-square-root estimate forms (WriteFRsqrt).
let Sched = WriteFRsqrt in {
def SSE_RSQRTPS : OpndItins<
  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;

def SSE_RSQRTSS : OpndItins<
  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}
   3259 
// Itineraries for the reciprocal estimate forms (WriteFRcp).
let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}
   3269 
   3270 /// sse_fp_unop_s - SSE1 unops in scalar form
   3271 /// For the non-AVX defs, we need $src1 to be tied to $dst because
   3272 /// the HW instructions are 2 operand / destructive.
   3273 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   3274                           ValueType vt, ValueType ScalarVT,
   3275                           X86MemOperand x86memop, Operand vec_memop,
   3276                           ComplexPattern mem_cpat, Intrinsic Intr,
   3277                           SDNode OpNode, Domain d, OpndItins itins,
   3278                           Predicate target, string Suffix> {
   3279   let hasSideEffects = 0 in {
   3280   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
   3281               !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
   3282             [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
   3283             Requires<[target]>;
   3284   let mayLoad = 1 in
   3285   def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
   3286             !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
   3287             [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
   3288             Sched<[itins.Sched.Folded, ReadAfterLd]>,
   3289             Requires<[target, OptForSize]>;
   3290 
   3291   let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
   3292   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
   3293               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   3294             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   3295   let mayLoad = 1 in
   3296   def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
   3297               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   3298             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   3299   }
   3300   }
   3301 
   3302   let Predicates = [target] in {
   3303   def : Pat<(vt (OpNode mem_cpat:$src)),
   3304             (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
   3305                  (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
   3306   // These are unary operations, but they are modeled as having 2 source operands
   3307   // because the high elements of the destination are unchanged in SSE.
   3308   def : Pat<(Intr VR128:$src),
   3309             (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
   3310   def : Pat<(Intr (load addr:$src)),
   3311             (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
   3312                                       addr:$src), VR128))>;
   3313   }
   3314   // We don't want to fold scalar loads into these instructions unless
   3315   // optimizing for size. This is because the folded instruction will have a
   3316   // partial register update, while the unfolded sequence will not, e.g.
   3317   // movss mem, %xmm0
   3318   // rcpss %xmm0, %xmm0
   3319   // which has a clobber before the rcp, vs.
   3320   // rcpss mem, %xmm0
   3321   let Predicates = [target, OptForSize] in {
   3322     def : Pat<(Intr mem_cpat:$src),
   3323                (!cast<Instruction>(NAME#Suffix##m_Int)
   3324                       (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
   3325   }
   3326 }
   3327 
   3328 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   3329                           ValueType vt, ValueType ScalarVT,
   3330                           X86MemOperand x86memop, Operand vec_memop,
   3331                           ComplexPattern mem_cpat,
   3332                           Intrinsic Intr, SDNode OpNode, Domain d,
   3333                           OpndItins itins, string Suffix> {
   3334   let hasSideEffects = 0 in {
   3335   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
   3336             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   3337             [], itins.rr, d>, Sched<[itins.Sched]>;
   3338   let mayLoad = 1 in
   3339   def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
   3340              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   3341             [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   3342   let isCodeGenOnly = 1 in {
   3343   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
   3344                 (ins VR128:$src1, VR128:$src2),
   3345              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   3346              []>, Sched<[itins.Sched.Folded]>;
   3347   let mayLoad = 1 in
   3348   def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
   3349                 (ins VR128:$src1, vec_memop:$src2),
   3350              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   3351              []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   3352   }
   3353   }
   3354 
   3355   // We don't want to fold scalar loads into these instructions unless
   3356   // optimizing for size. This is because the folded instruction will have a
   3357   // partial register update, while the unfolded sequence will not, e.g.
   3358   // vmovss mem, %xmm0
   3359   // vrcpss %xmm0, %xmm0, %xmm0
   3360   // which has a clobber before the rcp, vs.
   3361   // vrcpss mem, %xmm0, %xmm0
   3362   // TODO: In theory, we could fold the load, and avoid the stall caused by
   3363   // the partial register store, either in ExeDepFix or with smarter RA.
   3364   let Predicates = [UseAVX] in {
   3365    def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
   3366                                 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
   3367   }
   3368   let Predicates = [HasAVX] in {
   3369    def : Pat<(Intr VR128:$src),
   3370              (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
   3371                                  VR128:$src)>;
   3372   }
   3373   let Predicates = [HasAVX, OptForSize] in {
   3374     def : Pat<(Intr mem_cpat:$src),
   3375               (!cast<Instruction>("V"#NAME#Suffix##m_Int)
   3376                     (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
   3377   }
   3378   let Predicates = [UseAVX, OptForSize] in {
   3379     def : Pat<(ScalarVT (OpNode (load addr:$src))),
   3380               (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
   3381             addr:$src)>;
   3382     def : Pat<(vt (OpNode mem_cpat:$src)),
   3383               (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
   3384                                   mem_cpat:$src)>;
   3385   }
   3386 }
   3387 
/// sse1_fp_unop_p - SSE1 unops in packed form.
/// AVX 128/256-bit forms are defined under the supplied predicate list;
/// the legacy SSE 128-bit forms are always defined.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins, list<Predicate> prds> {
let Predicates = prds in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  // Legacy SSE forms use alignment-checked memop loads.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
   3423 
/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// AVX 128/256-bit forms are guarded by HasAVX; the legacy SSE 128-bit
/// forms are always defined.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  // Legacy SSE forms use alignment-checked memop loads.
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
   3459 
// sse1_fp_unop_s - SSE1 scalar-single unops (e.g. sqrtss): instantiates the
// legacy SSE form plus the VEX three-operand AVX form through the
// sse_fp_unop_s / avx_fp_unop_s helper multiclasses (defined earlier in the
// file), binding the matching int_x86_sse_*_ss intrinsic by name.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
                      ssmem, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                      SSEPackedSingle, itins, UseSSE1, "SS">, XS;
  // AVX form: "v" mnemonic prefix, VEX.vvvv source (VEX_4V), VEX.L ignored.
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                      f32mem, ssmem, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
}
   3471 
// sse2_fp_unop_s - SSE2 scalar-double unops (e.g. sqrtsd): legacy SSE form
// plus the VEX three-operand AVX form, binding the matching
// int_x86_sse2_*_sd intrinsic by name.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
                         sdmem, sse_load_f64,
                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                         OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
  // AVX form: "v" mnemonic prefix, VEX.vvvv source (VEX_4V), VEX.L ignored.
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
                         f64mem, sdmem, sse_load_f64,
                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                         OpNode, SSEPackedDouble, itins, "SD">,
                         XD, VEX_4V, VEX_LIG;
}
   3484 
// Square root (opcode 0x51): scalar and packed, single and double precision,
// in both legacy SSE and VEX encodings.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
   3490 
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
// Single precision only (opcodes 0x52/0x53) -- no f64 variants exist.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
   3497 
   3498 // There is no f64 version of the reciprocal approximation instructions.
   3499 
   3500 // TODO: We should add *scalar* op patterns for these just like we have for
   3501 // the binops above. If the binop and unop patterns could all be unified
   3502 // that would be even better.
   3503 
// scalar_unary_math_patterns - Select "apply scalar unary intrinsic, then
// merge the result into the low element of $dst" (expressed as a Movss/Movsd
// style Move node, or a low-element blend) onto the corresponding *r_Int
// instruction form.
multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
                                      SDNode Move, ValueType VT,
                                      Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // With SSE 4.1, blendi is preferred to movs*, so match that too.
  // Immediate 1 selects only the low element.
  let Predicates = [UseSSE41] in {
    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
              (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;

    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
              (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}
   3527 
// Fold low-element merges of rcp/rsqrt/sqrt intrinsics into the *_Int forms.
defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                  v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                  v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
                                  v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
                                  v2f64, UseSSE2>;
   3536 
   3537 
   3538 //===----------------------------------------------------------------------===//
   3539 // SSE 1 & 2 - Non-temporal stores
   3540 //===----------------------------------------------------------------------===//
   3541 
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
// VEX-encoded forms. All packed variants match only aligned non-temporal
// stores (alignednontemporalstore).
let Predicates = [HasAVX, NoVLX] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;

let ExeDomain = SSEPackedInt in
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)],
                                                   IIC_SSE_MOVNT>, VEX;

// 256-bit (YMM) forms.
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins f256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)],
                                              IIC_SSE_MOVNT>, VEX, VEX_L;
}

// Legacy SSE forms.
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

// There is no AVX form for instructions below this point
// MOVNTI: SSE2 non-temporal store from a general-purpose register; unlike
// the packed forms above it matches the unaligned nontemporalstore node.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]

// Select the remaining integer vector types to (V)MOVNTDQ, which is typed
// v2i64/v4i64 above.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity
   3641 
   3642 //===----------------------------------------------------------------------===//
   3643 // SSE 1 & 2 - Prefetch and memory fence
   3644 //===----------------------------------------------------------------------===//
   3645 
// Prefetch intrinsic. The MRM0m-MRM3m forms select the /r hint field of
// opcode 0F 18. The third pattern operand is the locality hint (3 = t0
// down to 0 = nta); the trailing (i32 1) presumably selects the data cache
// -- confirm against the ISD::PREFETCH operand order.
let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
}
   3661 
// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache line containing the addressed byte (0F AE /7); selected only
// from the int_x86_sse2_clflush intrinsic.
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
}
   3669 
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
// (OBXS supplies the XS/rep prefix on the plain 0x90 nop opcode.)
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
              OBXS, Requires<[HasSSE2]>;
}
   3677 
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
// to include any 64-bit target.
// All three share opcode 0F AE and are distinguished by the fixed ModRM byte
// (MRM_F8/E8/F0); each is selected only from its intrinsic.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
               PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
               TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
               TB, Requires<[HasMFence]>;
} // SchedRW

// Lower the generic X86MFence node to the MFENCE instruction.
def : Pat<(X86MFence), (MFENCE)>;
   3694 
   3695 //===----------------------------------------------------------------------===//
   3696 // SSE 1 & 2 - Load/Store XCSR register
   3697 //===----------------------------------------------------------------------===//
   3698 
// ldmxcsr/stmxcsr load and store the 32-bit MXCSR register from/to memory.
// VEX-encoded forms:
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;

// Legacy SSE forms:
let Predicates = [UseSSE1] in {
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
}
   3714 
   3715 //===---------------------------------------------------------------------===//
   3716 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
   3717 //===---------------------------------------------------------------------===//
   3718 
let ExeDomain = SSEPackedInt in { // SSE integer instructions

// Register-to-register moves. All (V)MOVDQA/(V)MOVDQU defs below carry empty
// patterns; selection is done through separate Pat<> records / folding logic
// elsewhere, so hasSideEffects = 0 keeps them freely movable.
let hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX, VEX_L;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX, VEX_L;
}

// For Disassembler
// (_REV forms use the store-direction opcode 0x7F with register operands.)
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteMove] in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>,
                        VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>,
                        VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}

// VEX load forms.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX, VEX_L;
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX, VEX_L;
}
}

// VEX store forms.
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i256mem:$dst, VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX, VEX_L;
}
}

// Legacy SSE2 register forms.
let SchedRW = [WriteMove] in {
let hasSideEffects = 0 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_MOVA_P_RR>;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
}
} // SchedRW

// Legacy SSE2 load forms (patterns intentionally disabled; see comments).
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
                   IIC_SSE_MOVA_P_RM>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                   IIC_SSE_MOVU_P_RM>,
                 XS, Requires<[UseSSE2]>;
}

// Legacy SSE2 store forms.
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVA_P_MR>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVU_P_MR>,
                 XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt
   3839 
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
// The trailing 0 is the emit priority: these aliases are never used for
// printing, only for assembly matching.
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
   3850 
   3851 //===---------------------------------------------------------------------===//
   3852 // SSE2 - Packed Integer Arithmetic Instructions
   3853 //===---------------------------------------------------------------------===//
   3854 
// Itinerary bundle for pmaddwd: same itinerary class for the reg-reg and
// reg-mem forms, scheduled as a vector integer multiply.
let Sched = WriteVecIMul in
def SSE_PMADD : OpndItins<
  IIC_SSE_PMADD, IIC_SSE_PMADD
>;
   3859 
let ExeDomain = SSEPackedInt in { // SSE integer instructions

// PDI_binop_rm_int - Intrinsic-based SSE2 integer binop: rr and rm forms
// that lower an IntId call directly. Is2Addr selects the two-operand
// (legacy, $src1 tied to $dst) vs three-operand (VEX) asm string.
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            RegisterClass RC, PatFrag memop_frag,
                            X86MemOperand x86memop,
                            OpndItins itins,
                            bit IsCommutable = 0,
                            bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
      Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

// PDI_binop_all_int - Instantiates the VEX 128-bit, legacy 128-bit, and
// VEX 256-bit (AVX2) variants of an intrinsic binop in one go.
multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             Intrinsic IntId256, OpndItins itins,
                             bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
                                 VR128, loadv2i64, i128mem, itins,
                                 IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
                               i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
                                   VR256, loadv4i64, i256mem, itins,
                                   IsCommutable, 0>, VEX_4V, VEX_L;
}

// PDI_binop_rmi - Vector shifts: rr/rm forms shift by the low bits of a
// 128-bit register/memory operand (OpNode), ri shifts by an 8-bit
// immediate (OpNode2, second opcode/ImmForm).
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, ShiftOpndItins itins,
                         bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
        itins.rr>, Sched<[WriteVecShift]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (SrcVT (bitconvert (ld_frag addr:$src2))))))], itins.rm>,
      Sched<[WriteVecShiftLd, ReadAfterLd]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
       Sched<[WriteVecShift]>;
}

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  // NOTE(review): unlike PDI_binop_rm_int, no itinerary argument is passed
  // to the rr/rm instructions here -- confirm whether that is intentional.
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
   3958 
// Wrapping add/sub.
defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 1, NoVLX>;
// Signed/unsigned saturating add.
defm PADDSB  : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
// 16-bit multiplies (low, unsigned-high, signed-high).
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
// Wrapping subtract.
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 0, NoVLX>;
// Signed/unsigned saturating subtract.
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
// Min/max and averaging.
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
   4009 
// Intrinsic forms
// PMADDWD: multiply packed signed words and horizontally add adjacent
// pairs of the 32-bit products; selected directly from the SSE2/AVX2
// intrinsics rather than a generic SDNode.
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
   4013 
   4014 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
   4015 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
   4016                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
   4017                              VEX_4V;
   4018 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
   4019 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
   4020                              loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
   4021                              VEX_4V, VEX_L;
   4022 let Constraints = "$src1 = $dst" in
   4023 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
   4024                             memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
   4025 
// PMULUDQ: unsigned multiply of the even-numbered 32-bit elements,
// producing full 64-bit products (v4i32 -> v2i64, v8i32 -> v4i64).
let Predicates = [HasAVX, NoVLX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, loadv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
   4037 
   4038 //===---------------------------------------------------------------------===//
   4039 // SSE2 - Packed Integer Logical Instructions
   4040 //===---------------------------------------------------------------------===//
   4041 
let Predicates = [HasAVX, NoVLX] in {
// AVX 128-bit dword/qword shifts; PDI_binop_rmi emits both the
// by-XMM-count form (first opcode) and the by-immediate form (second).
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

// Note: there is no vpsraq before AVX-512, hence no qword arithmetic shift.
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
} // Predicates = [HasAVX, NoVLX]
   4061 
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
// AVX 128-bit word-element shifts (BWI predicate because 16-bit elements).
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, loadv2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
   4073 
   4074 
   4075 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] ,
   4076                                     Predicates = [HasAVX, NoVLX_Or_NoBWI]in {
   4077   // 128-bit logical shifts.
   4078   def VPSLLDQri : PDIi8<0x73, MRM7r,
   4079                     (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
   4080                     "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   4081                     [(set VR128:$dst,
   4082                       (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
   4083                     VEX_4V;
   4084   def VPSRLDQri : PDIi8<0x73, MRM3r,
   4085                     (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
   4086                     "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   4087                     [(set VR128:$dst,
   4088                       (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
   4089                     VEX_4V;
   4090   // PSRADQri doesn't exist in SSE[1-3].
   4091 } // Predicates = [HasAVX, NoVLX_Or_NoBWI]
   4092 
let Predicates = [HasAVX2, NoVLX] in {
// AVX2 256-bit dword/qword shifts; the variable count still comes from a
// 128-bit vector (second ValueType), the destination is 256-bit.
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
}// Predicates = [HasAVX2, NoVLX]
   4112 
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// AVX2 256-bit word-element shifts.
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
}// Predicates = [HasAVX2, NoVLX_Or_NoBWI]
   4124 
// 256-bit whole-register byte shifts (immediate count only); these shift
// each 128-bit lane independently per the AVX2 pslldq/psrldq semantics —
// TODO confirm lane behavior is handled by X86vshldq/X86vshrdq lowering.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 ,
                                    Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // 256-bit logical shifts.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (v32i8 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
                    VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (v32i8 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
                    VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
   4142 
// Legacy SSE2 shifts: destructive two-operand encodings, so the first
// source is tied to the destination.
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;

// Arithmetic right shifts exist only for word/dword elements in SSE2.
defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, memopv2i64,
                           SSE_INTSHIFT_ITINS_P>;

// Whole-register byte shifts; immediate count only (the ISA has no
// by-register form of pslldq/psrldq).
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
  // 128-bit logical shifts.
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
                       IIC_SSE_INTSHDQ_P_RI>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
                       IIC_SSE_INTSHDQ_P_RI>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
   4188 
   4189 //===---------------------------------------------------------------------===//
   4190 // SSE2 - Packed Integer Comparison Instructions
   4191 //===---------------------------------------------------------------------===//
   4192 
// Packed compares producing all-ones/all-zeros element masks.
// Equality compares:
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
// Signed greater-than compares:
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
   4205 
   4206 //===---------------------------------------------------------------------===//
   4207 // SSE2 - Packed Integer Shuffle Instructions
   4208 //===---------------------------------------------------------------------===//
   4209 
let ExeDomain = SSEPackedInt in {
// Shuffle a single source according to an 8-bit immediate control.
// Emits AVX 128-bit (V#NAME#ri/mi), AVX2 256-bit (V#NAME#Yri/Ymi) and
// legacy SSE2 (ri/mi) register and memory forms, gated by 'prd'.
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, Predicate prd> {
let Predicates = [HasAVX, prd] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                  Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX2, prd] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                   Sched<[WriteShuffleLd]>;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
           Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
} // ExeDomain = SSEPackedInt
   4268 
// PSHUFD shuffles all four dwords; PSHUFHW/PSHUFLW shuffle only the
// high/low four words (prefix byte selects the variant: PD/XS/XD).
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             NoVLX_Or_NoBWI>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             NoVLX_Or_NoBWI>, XD;
   4274 
// Also select PSHUFD for pshufd-style shuffles of v4f32 operands.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}
   4288 
   4289 //===---------------------------------------------------------------------===//
   4290 // Packed Integer Pack Instructions (SSE & AVX)
   4291 //===---------------------------------------------------------------------===//
   4292 
let ExeDomain = SSEPackedInt in {
// Pack instructions narrow two sources into one register with saturation
// (X86Packss = signed, X86Packus = unsigned saturation).
// sse2_pack: 128-bit forms with the SSE2 (PDI) encoding.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
                     bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
               Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1),
                                    (bitconvert (ld_frag addr:$src2)))))]>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// sse2_pack_y: 256-bit (AVX2) forms, always three-operand.
multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode> {
  def Yrr : PDI<opc, MRMSrcReg,
                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
                (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1),
                                     (bitconvert (loadv4i64 addr:$src2)))))]>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// sse4_pack: 128-bit forms with the SSE4.1 (SS48I) encoding (packusdw).
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
                     bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
                 Sched<[WriteShuffle]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1),
                                      (bitconvert (ld_frag addr:$src2)))))]>,
                 Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// sse4_pack_y: 256-bit (AVX2) forms of the SSE4.1-encoded pack.
multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode> {
  def Yrr : SS48I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                  Sched<[WriteShuffle]>;
  def Yrm : SS48I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1),
                                       (bitconvert (loadv4i64 addr:$src2)))))]>,
                  Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// VEX-encoded 128-bit forms.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
                             loadv2i64, 0>, VEX_4V;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
                             loadv2i64, 0>, VEX_4V;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
                             loadv2i64, 0>, VEX_4V;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                             loadv2i64, 0>, VEX_4V;
}

// VEX-encoded 256-bit forms.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
                               VEX_4V, VEX_L;
  defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
                               VEX_4V, VEX_L;

  defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
                               VEX_4V, VEX_L;
  defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
                               VEX_4V, VEX_L;
}

// Legacy destructive forms.
let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
                            memopv2i64>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
                            memopv2i64>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
                            memopv2i64>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
                            memopv2i64>;
}
} // ExeDomain = SSEPackedInt
   4417 
   4418 //===---------------------------------------------------------------------===//
   4419 // SSE2 - Packed Integer Unpack Instructions
   4420 //===---------------------------------------------------------------------===//
   4421 
let ExeDomain = SSEPackedInt in {
// Interleave (unpack) low/high elements of two sources.
// sse2_unpack: 128-bit register/memory forms.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                  (bitconvert (ld_frag addr:$src2)))))],
                                               IIC_SSE_UNPCK>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// sse2_unpack_y: 256-bit (AVX2) forms, always three-operand.
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode> {
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
      Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1,
                                  (bitconvert (loadv4i64 addr:$src2)))))]>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}


// Byte/word unpacks need only AVX-512BW+VL to be superseded; dword/qword
// unpacks are superseded by plain AVX-512VL, hence the split predicates.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 loadv2i64, 0>, VEX_4V;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
                                   VEX_4V, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
                                   VEX_4V, VEX_L;
}

// Legacy destructive forms.
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                memopv2i64>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                memopv2i64>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                memopv2i64>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                memopv2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                memopv2i64>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                memopv2i64>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                memopv2i64>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                memopv2i64>;
}
} // ExeDomain = SSEPackedInt
   4521 
   4522 //===---------------------------------------------------------------------===//
   4523 // SSE2 - Packed Integer Extract and Insert
   4524 //===---------------------------------------------------------------------===//
   4525 
let ExeDomain = SSEPackedInt in {
// PINSRW: insert a 16-bit value (from a GPR or memory) into the word
// element selected by the immediate.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// Extract
// PEXTRW: extract the word element selected by the immediate into a GPR.
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>, PD, VEX,
                Sched<[WriteShuffle]>;
// NOTE(review): this register-only form carries the load scheduling class
// (WriteShuffleLd/ReadAfterLd) unlike VPEXTRWri above — looks copy-pasted
// from a memory form; confirm intended.
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))], IIC_SSE_PEXTRW>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt
   4572 
   4573 //===---------------------------------------------------------------------===//
   4574 // SSE2 - Packed Mask Creation
   4575 //===---------------------------------------------------------------------===//
   4576 
   4577 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
   4578 
   4579 def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
   4580            (ins VR128:$src),
   4581            "pmovmskb\t{$src, $dst|$dst, $src}",
   4582            [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
   4583            IIC_SSE_MOVMSK>, VEX;
   4584 
   4585 let Predicates = [HasAVX2] in {
   4586 def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
   4587            (ins VR256:$src),
   4588            "pmovmskb\t{$src, $dst|$dst, $src}",
   4589            [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
   4590            VEX, VEX_L;
   4591 }
   4592 
   4593 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
   4594            "pmovmskb\t{$src, $dst|$dst, $src}",
   4595            [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
   4596            IIC_SSE_MOVMSK>;
   4597 
   4598 } // ExeDomain = SSEPackedInt
   4599 
   4600 //===---------------------------------------------------------------------===//
   4601 // SSE2 - Conditional Store
   4602 //===---------------------------------------------------------------------===//
   4603 
   4604 let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
   4605 
   4606 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
   4607 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
   4608            (ins VR128:$src, VR128:$mask),
   4609            "maskmovdqu\t{$mask, $src|$src, $mask}",
   4610            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
   4611            IIC_SSE_MASKMOV>, VEX;
   4612 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
   4613 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
   4614            (ins VR128:$src, VR128:$mask),
   4615            "maskmovdqu\t{$mask, $src|$src, $mask}",
   4616            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
   4617            IIC_SSE_MASKMOV>, VEX;
   4618 
   4619 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
   4620 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
   4621            "maskmovdqu\t{$mask, $src|$src, $mask}",
   4622            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
   4623            IIC_SSE_MASKMOV>;
   4624 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
   4625 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
   4626            "maskmovdqu\t{$mask, $src|$src, $mask}",
   4627            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
   4628            IIC_SSE_MASKMOV>;
   4629 
   4630 } // ExeDomain = SSEPackedInt
   4631 
   4632 //===---------------------------------------------------------------------===//
   4633 // SSE2 - Move Doubleword/Quadword
   4634 //===---------------------------------------------------------------------===//
   4635 
   4636 //===---------------------------------------------------------------------===//
   4637 // Move Int Doubleword to Packed Double Int
   4638 //
   4639 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
   4640                       "movd\t{$src, $dst|$dst, $src}",
   4641                       [(set VR128:$dst,
   4642                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
   4643                         VEX, Sched<[WriteMove]>;
   4644 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
   4645                       "movd\t{$src, $dst|$dst, $src}",
   4646                       [(set VR128:$dst,
   4647                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
   4648                         IIC_SSE_MOVDQ>,
   4649                       VEX, Sched<[WriteLoad]>;
   4650 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
   4651                         "movq\t{$src, $dst|$dst, $src}",
   4652                         [(set VR128:$dst,
   4653                           (v2i64 (scalar_to_vector GR64:$src)))],
   4654                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
   4655 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
   4656 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   4657                         "movq\t{$src, $dst|$dst, $src}",
   4658                         [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
   4659 let isCodeGenOnly = 1 in
   4660 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
   4661                        "movq\t{$src, $dst|$dst, $src}",
   4662                        [(set FR64:$dst, (bitconvert GR64:$src))],
   4663                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
   4664 
   4665 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
   4666                       "movd\t{$src, $dst|$dst, $src}",
   4667                       [(set VR128:$dst,
   4668                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
   4669                   Sched<[WriteMove]>;
   4670 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
   4671                       "movd\t{$src, $dst|$dst, $src}",
   4672                       [(set VR128:$dst,
   4673                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
   4674                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
   4675 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
   4676                         "mov{d|q}\t{$src, $dst|$dst, $src}",
   4677                         [(set VR128:$dst,
   4678                           (v2i64 (scalar_to_vector GR64:$src)))],
   4679                           IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
   4680 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
   4681 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   4682                         "mov{d|q}\t{$src, $dst|$dst, $src}",
   4683                         [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
   4684 let isCodeGenOnly = 1 in
   4685 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
   4686                        "mov{d|q}\t{$src, $dst|$dst, $src}",
   4687                        [(set FR64:$dst, (bitconvert GR64:$src))],
   4688                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
   4689 
   4690 //===---------------------------------------------------------------------===//
   4691 // Move Int Doubleword to Single Scalar
   4692 //
   4693 let isCodeGenOnly = 1 in {
   4694   def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
   4695                         "movd\t{$src, $dst|$dst, $src}",
   4696                         [(set FR32:$dst, (bitconvert GR32:$src))],
   4697                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
   4698 
   4699   def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
   4700                         "movd\t{$src, $dst|$dst, $src}",
   4701                         [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
   4702                         IIC_SSE_MOVDQ>,
   4703                         VEX, Sched<[WriteLoad]>;
   4704   def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
   4705                         "movd\t{$src, $dst|$dst, $src}",
   4706                         [(set FR32:$dst, (bitconvert GR32:$src))],
   4707                         IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
   4708 
   4709   def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
   4710                         "movd\t{$src, $dst|$dst, $src}",
   4711                         [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
   4712                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
   4713 }
   4714 
   4715 //===---------------------------------------------------------------------===//
   4716 // Move Packed Doubleword Int to Packed Double Int
   4717 //
   4718 def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
   4719                        "movd\t{$src, $dst|$dst, $src}",
   4720                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
   4721                                         (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
   4722                     Sched<[WriteMove]>;
   4723 def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
   4724                        (ins i32mem:$dst, VR128:$src),
   4725                        "movd\t{$src, $dst|$dst, $src}",
   4726                        [(store (i32 (extractelt (v4i32 VR128:$src),
   4727                                      (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
   4728                                      VEX, Sched<[WriteStore]>;
   4729 def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
   4730                        "movd\t{$src, $dst|$dst, $src}",
   4731                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
   4732                                         (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
   4733                    Sched<[WriteMove]>;
   4734 def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
   4735                        "movd\t{$src, $dst|$dst, $src}",
   4736                        [(store (i32 (extractelt (v4i32 VR128:$src),
   4737                                      (iPTR 0))), addr:$dst)],
   4738                                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
   4739 
   4740 def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
   4741         (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
   4742 
   4743 def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
   4744         (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
   4745 
   4746 def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
   4747         (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
   4748 
   4749 def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
   4750         (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
   4751 
   4752 //===---------------------------------------------------------------------===//
   4753 // Move Packed Doubleword Int first element to Doubleword Int
   4754 //
   4755 let SchedRW = [WriteMove] in {
   4756 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
   4757                           "movq\t{$src, $dst|$dst, $src}",
   4758                           [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
   4759                                                         (iPTR 0)))],
   4760                                                            IIC_SSE_MOVD_ToGP>,
   4761                       VEX;
   4762 
   4763 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
   4764                         "mov{d|q}\t{$src, $dst|$dst, $src}",
   4765                         [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
   4766                                                          (iPTR 0)))],
   4767                                                          IIC_SSE_MOVD_ToGP>;
   4768 } //SchedRW
   4769 
   4770 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
   4771 def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
   4772                           (ins i64mem:$dst, VR128:$src),
   4773                           "movq\t{$src, $dst|$dst, $src}",
   4774                           [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
   4775 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
   4776 def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
   4777                         "mov{d|q}\t{$src, $dst|$dst, $src}",
   4778                         [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
   4779 
   4780 //===---------------------------------------------------------------------===//
   4781 // Bitcast FR64 <-> GR64
   4782 //
   4783 let isCodeGenOnly = 1 in {
   4784   let Predicates = [UseAVX] in
   4785   def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
   4786                           "movq\t{$src, $dst|$dst, $src}",
   4787                           [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
   4788                           VEX, Sched<[WriteLoad]>;
   4789   def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
   4790                            "movq\t{$src, $dst|$dst, $src}",
   4791                            [(set GR64:$dst, (bitconvert FR64:$src))],
   4792                            IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
   4793   def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
   4794                            "movq\t{$src, $dst|$dst, $src}",
   4795                            [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
   4796                            IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
   4797 
   4798   def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
   4799                          "movq\t{$src, $dst|$dst, $src}",
   4800                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
   4801                          IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
   4802   def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
   4803                          "mov{d|q}\t{$src, $dst|$dst, $src}",
   4804                          [(set GR64:$dst, (bitconvert FR64:$src))],
   4805                          IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
   4806   def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
   4807                          "movq\t{$src, $dst|$dst, $src}",
   4808                          [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
   4809                          IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
   4810 }
   4811 
   4812 //===---------------------------------------------------------------------===//
   4813 // Move Scalar Single to Double Int
   4814 //
   4815 let isCodeGenOnly = 1 in {
   4816   def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
   4817                         "movd\t{$src, $dst|$dst, $src}",
   4818                         [(set GR32:$dst, (bitconvert FR32:$src))],
   4819                         IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
   4820   def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
   4821                         "movd\t{$src, $dst|$dst, $src}",
   4822                         [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
   4823                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
   4824   def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
   4825                         "movd\t{$src, $dst|$dst, $src}",
   4826                         [(set GR32:$dst, (bitconvert FR32:$src))],
   4827                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
   4828   def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
   4829                         "movd\t{$src, $dst|$dst, $src}",
   4830                         [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
   4831                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
   4832 }
   4833 
// Patterns selecting movd/movq for zero-extended scalar-to-vector moves
// (X86vzmovl: move the scalar into element 0 and zero the remaining lanes).
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (VMOV64toPQIrr GR64:$src)>;

    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
              (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
  }
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
}

// Non-VEX (SSE2) equivalents of the 128-bit patterns above.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;

    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (MOV64toPQIrr GR64:$src)>;
  }
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
  }
}
   4882 
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
// (The trailing 0 marks each alias as parse-only: it is accepted by the
// assembler but never used for printing.)
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
   4895 
   4896 //===---------------------------------------------------------------------===//
   4897 // SSE2 - Move Quadword
   4898 //===---------------------------------------------------------------------===//
   4899 
   4900 //===---------------------------------------------------------------------===//
   4901 // Move Quadword Int to Packed Quadword Int
   4902 //
   4903 
   4904 let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
   4905 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   4906                     "vmovq\t{$src, $dst|$dst, $src}",
   4907                     [(set VR128:$dst,
   4908                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
   4909                     VEX, Requires<[UseAVX]>;
   4910 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   4911                     "movq\t{$src, $dst|$dst, $src}",
   4912                     [(set VR128:$dst,
   4913                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
   4914                       IIC_SSE_MOVDQ>, XS,
   4915                     Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
   4916 } // ExeDomain, SchedRW
   4917 
   4918 //===---------------------------------------------------------------------===//
   4919 // Move Packed Quadword Int to Quadword Int
   4920 //
   4921 let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
   4922 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
   4923                       "movq\t{$src, $dst|$dst, $src}",
   4924                       [(store (i64 (extractelt (v2i64 VR128:$src),
   4925                                     (iPTR 0))), addr:$dst)],
   4926                                     IIC_SSE_MOVDQ>, VEX;
   4927 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
   4928                       "movq\t{$src, $dst|$dst, $src}",
   4929                       [(store (i64 (extractelt (v2i64 VR128:$src),
   4930                                     (iPTR 0))), addr:$dst)],
   4931                                     IIC_SSE_MOVDQ>;
   4932 } // ExeDomain, SchedRW
   4933 
   4934 // For disassembler only
   4935 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
   4936     SchedRW = [WriteVecLogic] in {
   4937 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
   4938                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
   4939 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
   4940                       "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
   4941 }
   4942 
   4943 // Aliases to help the assembler pick two byte VEX encodings by swapping the
   4944 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
   4945 def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
   4946                 (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
   4947 
   4948 //===---------------------------------------------------------------------===//
   4949 // Store / copy lower 64-bits of a XMM register.
   4950 //
   4951 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
   4952 def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   4953                      "vmovq\t{$src, $dst|$dst, $src}",
   4954                      [(set VR128:$dst,
   4955                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
   4956                                                  (loadi64 addr:$src))))))],
   4957                                                  IIC_SSE_MOVDQ>,
   4958                      XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
   4959 
   4960 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
   4961                      "movq\t{$src, $dst|$dst, $src}",
   4962                      [(set VR128:$dst,
   4963                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
   4964                                                  (loadi64 addr:$src))))))],
   4965                                                  IIC_SSE_MOVDQ>,
   4966                      XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
   4967 } // ExeDomain, isCodeGenOnly, AddedComplexity
   4968 
   4969 let Predicates = [UseAVX], AddedComplexity = 20 in {
   4970   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
   4971             (VMOVZQI2PQIrm addr:$src)>;
   4972   def : Pat<(v2i64 (X86vzload addr:$src)),
   4973             (VMOVZQI2PQIrm addr:$src)>;
   4974   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
   4975               (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
   4976             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
   4977   def : Pat<(v4i64 (X86vzload addr:$src)),
   4978             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
   4979 }
   4980 
   4981 let Predicates = [UseSSE2], AddedComplexity = 20 in {
   4982   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
   4983             (MOVZQI2PQIrm addr:$src)>;
   4984   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
   4985 }
   4986 
   4987 //===---------------------------------------------------------------------===//
   4988 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
   4989 // IA32 document. movq xmm1, xmm2 does clear the high bits.
   4990 //
   4991 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
   4992 let AddedComplexity = 15 in
   4993 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
   4994                         "vmovq\t{$src, $dst|$dst, $src}",
   4995                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
   4996                     IIC_SSE_MOVQ_RR>,
   4997                       XS, VEX, Requires<[UseAVX]>;
   4998 let AddedComplexity = 15 in
   4999 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
   5000                         "movq\t{$src, $dst|$dst, $src}",
   5001                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
   5002                     IIC_SSE_MOVQ_RR>,
   5003                       XS, Requires<[UseSSE2]>;
   5004 } // ExeDomain, SchedRW
   5005 
   5006 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
   5007 let AddedComplexity = 20 in
   5008 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
   5009                         "vmovq\t{$src, $dst|$dst, $src}",
   5010                     [(set VR128:$dst, (v2i64 (X86vzmovl
   5011                                              (loadv2i64 addr:$src))))],
   5012                                              IIC_SSE_MOVDQ>,
   5013                       XS, VEX, Requires<[UseAVX]>;
   5014 let AddedComplexity = 20 in {
   5015 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
   5016                         "movq\t{$src, $dst|$dst, $src}",
   5017                     [(set VR128:$dst, (v2i64 (X86vzmovl
   5018                                              (loadv2i64 addr:$src))))],
   5019                                              IIC_SSE_MOVDQ>,
   5020                       XS, Requires<[UseSSE2]>;
   5021 }
   5022 } // ExeDomain, isCodeGenOnly, SchedRW
   5023 
   5024 let AddedComplexity = 20 in {
   5025   let Predicates = [UseAVX] in {
   5026     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
   5027               (VMOVZPQILo2PQIrr VR128:$src)>;
   5028   }
   5029   let Predicates = [UseSSE2] in {
   5030     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
   5031               (MOVZPQILo2PQIrr VR128:$src)>;
   5032   }
   5033 }
   5034 
   5035 //===---------------------------------------------------------------------===//
   5036 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
   5037 //===---------------------------------------------------------------------===//
   5038 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
   5039                               ValueType vt, RegisterClass RC, PatFrag mem_frag,
   5040                               X86MemOperand x86memop> {
   5041 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
   5042                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   5043                       [(set RC:$dst, (vt (OpNode RC:$src)))],
   5044                       IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
   5045 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
   5046                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   5047                       [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
   5048                       IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
   5049 }
   5050 
// Instantiations: AVX 128/256-bit forms (load need not be aligned, loadv*),
// then SSE3 forms (memopv4f32 requires alignment).
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;

// The multiclass only covers the FP vector types; also select the integer
// (v4i32/v8i32) forms of the shuffle nodes onto the same instructions.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
   5095 
   5096 //===---------------------------------------------------------------------===//
   5097 // SSE3 - Replicate Double FP - MOVDDUP
   5098 //===---------------------------------------------------------------------===//
   5099 
   5100 multiclass sse3_replicate_dfp<string OpcodeStr> {
   5101 def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
   5102                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   5103                     [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
   5104                     IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
   5105 def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
   5106                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   5107                     [(set VR128:$dst,
   5108                       (v2f64 (X86Movddup
   5109                               (scalar_to_vector (loadf64 addr:$src)))))],
   5110                               IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
   5111 }
   5112 
// FIXME: Merge with the class above once there are patterns for the ymm version
// 256-bit (ymm) MOVDDUP variant; kept separate from sse3_replicate_dfp
// until shared patterns allow merging. No itinerary is attached here.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
                    Sched<[WriteLoad]>;
}
   5125 
// Instantiate the VEX-encoded AVX forms (additionally guarded by NoVLX)
// and the legacy SSE3 form of MOVDDUP.
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
   5132 
   5133 
   5134 let Predicates = [HasAVX, NoVLX] in {
   5135   def : Pat<(X86Movddup (loadv2f64 addr:$src)),
   5136             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   5137 
   5138   // 256-bit version
   5139   def : Pat<(X86Movddup (loadv4i64 addr:$src)),
   5140             (VMOVDDUPYrm addr:$src)>;
   5141   def : Pat<(X86Movddup (v4i64 VR256:$src)),
   5142             (VMOVDDUPYrr VR256:$src)>;
   5143 }
   5144 
   5145 let Predicates = [HasAVX] in {
   5146   def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
   5147             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   5148   def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
   5149             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   5150   def : Pat<(X86Movddup (bc_v2f64
   5151                              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
   5152             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   5153 }
   5154 
// When optimizing for size, implement a 128-bit broadcast of a 64-bit
// scalar load with VMOVDDUP instead of a dedicated broadcast instruction.
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}
   5161 
// Legacy SSE3 MOVDDUP load-folding patterns, mirroring the AVX set above
// but using the memop load fragments.
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
   5173 
   5174 //===---------------------------------------------------------------------===//
   5175 // SSE3 - Move Unaligned Integer
   5176 //===---------------------------------------------------------------------===//
   5177 
// LDDQU: unaligned integer load, reachable only through the intrinsics
// (int_x86_sse3_ldu_dq / int_x86_avx_ldu_dq_256).
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                   VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}
   5193 
   5194 //===---------------------------------------------------------------------===//
   5195 // SSE3 - Arithmetic
   5196 //===---------------------------------------------------------------------===//
   5197 
   5198 multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
   5199                        X86MemOperand x86memop, OpndItins itins,
   5200                        PatFrag ld_frag, bit Is2Addr = 1> {
   5201   def rr : I<0xD0, MRMSrcReg,
   5202        (outs RC:$dst), (ins RC:$src1, RC:$src2),
   5203        !if(Is2Addr,
   5204            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   5205            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   5206        [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
   5207        Sched<[itins.Sched]>;
   5208   def rm : I<0xD0, MRMSrcMem,
   5209        (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
   5210        !if(Is2Addr,
   5211            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   5212            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   5213        [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>,
   5214        Sched<[itins.Sched.Folded, ReadAfterLd]>;
   5215 }
   5216 
// AVX ADDSUBPS/ADDSUBPD, 128- and 256-bit, three-operand VEX forms.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
  }
}
// Legacy SSE3 ADDSUBPS/ADDSUBPD: destructive two-operand forms.
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
}
   5239 
// Patterns used to select 'addsub' instructions.
// These map the X86Addsub ISD node onto the intrinsic-defined AVX
// instructions above, for both register and load-folded right operands.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;

  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
}
   5260 
// X86Addsub selection for the legacy SSE3 instructions.
let Predicates = [UseSSE3] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
}
   5271 
   5272 //===---------------------------------------------------------------------===//
   5273 // SSE3 Instructions
   5274 //===---------------------------------------------------------------------===//
   5275 
// Horizontal ops
// S3D_Int: horizontal add/sub building block emitting S3DI-format records
// (pattern-based, via the OpNode SDNode). Is2Addr selects legacy syntax.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
// S3_Int: identical in structure to S3D_Int but emits S3I-format records;
// used for the packed-double horizontal ops below.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
   5311 
// AVX horizontal add/sub, 128- and 256-bit, three-operand VEX forms.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, loadv4f32, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, loadv4f32, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, loadv2f64, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, loadv2f64, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
  }
}
   5334 
// Legacy SSE3 horizontal add/sub: destructive two-operand forms.
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         memopv2f64>;
  }
}
   5349 
   5350 //===---------------------------------------------------------------------===//
   5351 // SSSE3 - Packed Absolute Instructions
   5352 //===---------------------------------------------------------------------===//
   5353 
   5354 
/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Pattern-based (SDNode); the memory operand is bitconverted from the
/// ld_frag vector-of-i64 load to the operation's element type.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, PatFrag ld_frag> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (vt (OpNode VR128:$src)))],
                    IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
                    IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
}
   5371 
/// SS3I_unop_rm_y - 256-bit (AVX2) counterpart of SS3I_unop_rm; the load
/// fragment is fixed to loadv4i64 and no itinerary is attached.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                    Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
                    Sched<[WriteVecALULd]>;
}
   5388 
// Helper fragments to match sext vXi1 to vXiY (i.e. an all-ones-or-zeros
// sign mask per element). Byte elements use pcmpgt against zero because
// there is no vector arithmetic shift right for i8; wider elements use
// X86vsrai by (bits-1).
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                               VR128:$src))>;
def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                               VR256:$src))>;
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
   5398 
// AVX PABS (packed absolute value), 128-bit VEX forms. Note the predicate
// split: B/W forms need NoVLX_Or_NoBWI, the D form only NoVLX.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
}
   5406 
// Match the "(x + mask) ^ mask" absolute-value idiom, where mask is the
// sign-extended sign bit (see the PatLeafs above), onto VPABS.
let Predicates = [HasAVX] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}
   5421 
// AVX2 256-bit PABS forms, with the same B/W vs. D predicate split as the
// 128-bit versions above.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
}
   5429 
// 256-bit "(x + mask) ^ mask" absolute-value idiom, matched onto VPABS*Y.
let Predicates = [HasAVX2] in {
  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}
   5444 
// Legacy SSSE3 PABS forms (no predicate guard; selection is via UseSSSE3
// patterns below).
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
   5448 
// "(x + mask) ^ mask" absolute-value idiom for the legacy PABS forms.
let Predicates = [UseSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}
   5463 
   5464 //===---------------------------------------------------------------------===//
   5465 // SSSE3 - Packed Binary Operator Instructions
   5466 //===---------------------------------------------------------------------===//
   5467 
// Itinerary/scheduling bundles for the SSSE3 packed binary operators below.
let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
// Note: PMULHRSW uses the same itinerary class for both rr and rm forms.
let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;
   5491 
/// SS3I_binop_rm - Simple SSSE3 bin op
/// Pattern-based (SDNode) binary op; the memory operand is bitconverted
/// from memop_frag's type to OpVT. Register form is marked commutable.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                         X86MemOperand x86memop, OpndItins itins,
                         bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   5515 
   5516 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
   5517 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
   5518                              Intrinsic IntId128, OpndItins itins,
   5519                              PatFrag ld_frag, bit Is2Addr = 1> {
   5520   let isCommutable = 1 in
   5521   def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
   5522        (ins VR128:$src1, VR128:$src2),
   5523        !if(Is2Addr,
   5524          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   5525          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   5526        [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
   5527        Sched<[itins.Sched]>;
   5528   def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
   5529        (ins VR128:$src1, i128mem:$src2),
   5530        !if(Is2Addr,
   5531          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   5532          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   5533        [(set VR128:$dst,
   5534          (IntId128 VR128:$src1,
   5535           (bitconvert (ld_frag addr:$src2))))]>,
   5536        Sched<[itins.Sched.Folded, ReadAfterLd]>;
   5537 }
   5538 
/// SS3I_binop_rm_int_y - 256-bit (AVX2) intrinsic-based binary op;
/// always three-operand VEX syntax (no Is2Addr variant), scheduling class
/// supplied directly instead of via OpndItins.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
   5555 
// AVX 128-bit SSSE3 operations, three-operand VEX encodings. Horizontal,
// sign, shuffle and multiply-add ops are non-commutable; VPMULHRSW keeps
// the multiclass default (commutable).
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                      int_x86_ssse3_psign_b_128,
                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                      int_x86_ssse3_psign_w_128,
                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                      int_x86_ssse3_psign_d_128,
                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, loadv2i64, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
}
   5596 
   5597 let ImmT = NoImm, Predicates = [HasAVX2] in {
   5598 let isCommutable = 0 in {
   5599   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
   5600                                   loadv4i64, i256mem,
   5601                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   5602   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
   5603                                   loadv4i64, i256mem,
   5604                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   5605   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
   5606                                   loadv4i64, i256mem,
   5607                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   5608   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
   5609                                   loadv4i64, i256mem,
   5610                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   5611   defm VPSIGNBY   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
   5612                                         WriteVecALU>, VEX_4V, VEX_L;
   5613   defm VPSIGNWY   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
   5614                                         WriteVecALU>, VEX_4V, VEX_L;
   5615   defm VPSIGNDY   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
   5616                                         WriteVecALU>, VEX_4V, VEX_L;
   5617   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
   5618                                   loadv4i64, i256mem,
   5619                                   SSE_PSHUFB, 0>, VEX_4V, VEX_L;
   5620   defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
   5621                                         int_x86_avx2_phadd_sw,
   5622                                         WriteVecALU>, VEX_4V, VEX_L;
   5623   defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
   5624                                         int_x86_avx2_phsub_sw,
   5625                                         WriteVecALU>, VEX_4V, VEX_L;
   5626   defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
   5627                                        int_x86_avx2_pmadd_ub_sw,
   5628                                         WriteVecIMul>, VEX_4V, VEX_L;
   5629 }
   5630 defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
   5631                                         int_x86_avx2_pmul_hr_sw,
   5632                                         WriteVecIMul>, VEX_4V, VEX_L;
   5633 }
   5634 
// None of these have i8 immediate fields.
// Legacy SSSE3 forms: destructive two-operand encodings of the same ops.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SSE_PSIGN, memopv2i64>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SSE_PSIGN, memopv2i64>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SSE_PSIGN, memopv2i64>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128,
                                     SSE_PMADD, memopv2i64>;
}
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW, memopv2i64>;
}
   5668 
   5669 //===---------------------------------------------------------------------===//
   5670 // SSSE3 - Packed Align Instruction Patterns
   5671 //===---------------------------------------------------------------------===//
   5672 
// PALIGNR (128-bit): byte-granular alignment of the concatenated sources,
// window selected by the u8 immediate. No ISel patterns are attached here
// (hasSideEffects = 0); selection patterns are defined separately below.
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
   5692 
// 256-bit (VR256) AVX2 variant of ssse3_palignr.  Always emits the
// three-operand asm string.
// NOTE(review): the Is2Addr parameter is accepted but never referenced in
// this multiclass; it could be dropped once the caller at the VPALIGNR
// instantiation is updated.
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
   5708 
// Instantiations: VEX-encoded 128-bit AVX form, VEX.L 256-bit AVX2 form,
// and the legacy SSSE3 form with tied source/destination.
let Predicates = [HasAVX] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr">;
   5715 
// Select 256-bit X86PAlignr nodes of every element type onto the single
// byte-granularity VPALIGNRYrri instruction.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
}
   5726 
// Select 128-bit X86PAlignr nodes onto the VEX-encoded VPALIGNRrri.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
   5737 
// Same patterns for the legacy SSSE3 encoding (mirrors the AVX group above).
let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
   5748 
   5749 //===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization (MONITOR/MWAIT)
   5751 //===---------------------------------------------------------------------===//
   5752 
let SchedRW = [WriteSystem] in {
// Pseudo carrying the monitor intrinsic's explicit operands; expanded by a
// custom inserter (NOTE(review): presumably into MONITORrrr with the
// operands moved into EAX/ECX/EDX - confirm in X86ISelLowering).
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

// Real encoding: all operands are implicit registers.
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;

let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW

// Assembler aliases spelling out the implicit register operands, in both
// 32-bit (e*x) and 64-bit (r*x) forms.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
   5777 
   5778 //===----------------------------------------------------------------------===//
   5779 // SSE4.1 - Packed Move with Sign/Zero Extend
   5780 //===----------------------------------------------------------------------===//
   5781 
// One sign/zero-extending packed move: a register form and a memory form.
// OutRC/InRC are parameterized so the same multiclass yields both the
// 128->128 and the 128->256 (AVX2) variants.  Instructions carry no
// patterns; selection is added by the SS41I_pmovx*_patterns multiclasses.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          RegisterClass OutRC, RegisterClass InRC,
                          OpndItins itins> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [],
                 itins.rm>, Sched<[itins.Sched.Folded]>;
}
   5795 
// Instantiates the SSE, AVX (VEX, "v" prefix) and AVX2 (VEX.L, Y suffix)
// encodings of one pmovsx/pmovzx opcode.  prd is the anti-AVX512 predicate
// guarding the VEX forms.
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                          X86MemOperand MemOp, X86MemOperand MemYOp,
                          OpndItins SSEItins, OpndItins AVXItins,
                          OpndItins AVX2Itins, Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, AVXItins>, VEX;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
}
   5808 
// Pairs up the sign- and zero-extending variants of one element-size combo.
// The pmovzx opcode is always the pmovsx opcode + 0x10 (hence !add).
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
}
   5823 
// All six source/destination element-width combinations.  The memory operand
// shrinks with the extension ratio (2x -> 64-bit, 4x -> 32-bit, 8x -> 16-bit
// for the 128-bit forms).
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
   5832 
   5833 // AVX2 Patterns
// Selection patterns for the 256-bit (Y) pmovsx/pmovzx forms.  OpcPrefix is
// "VPMOVSX"/"VPMOVZX", ExtTy is "s"/"z" (used to build the extload PatFrag
// names), ExtOp is the matching X86vsext/X86vzext node.
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns: fold a vector extending load directly
  // into the memory form.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns: extend of a full/partial vector load
  // (plain load, vzmovl, vzload, bitcast forms).
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  // NOTE(review): this pattern is byte-identical to the first one of this
  // group - unlike the BD/BQ/WD/WQ/DQ groups, whose first entry uses a
  // scalar_to_vector load.  Looks like a redundant (harmless) duplicate.
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }
}
   5933 
   5934 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
   5935 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
   5936 
// SSE4.1/AVX patterns.
// 128-bit analogue of SS41I_pmovx_avx2_patterns.  ExtLoad16 differs between
// the sign (extloadi32i16) and zero (loadi16_anyext) instantiations because
// only the BQ form extends from a 16-bit memory quantity.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp, PatFrag ExtLoad16> {
  // Register-register forms.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  // Vector extending loads folded into the memory forms.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  // Extend of a loaded vector in its various spellings (scalar_to_vector,
  // vzmovl, vzload, plain load + bitcast).
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}
   6039 
// Instantiate the 128-bit patterns for the VEX and legacy encodings.  The
// AVX forms are unconditional here; the SSE forms are gated on UseSSE41.
defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
}
   6047 
   6048 //===----------------------------------------------------------------------===//
   6049 // SSE4.1 - Extract Instructions
   6050 //===----------------------------------------------------------------------===//
   6051 
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
// PEXTRB: extract byte $src2 of a v16i8 either into a 32/64-bit GPR (rr)
// or straight to an 8-bit memory location (mr).
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  // Store form: X86pextrb's zero-extended result is truncated back to i8.
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
                                                 imm:$src2)))), addr:$dst)]>;
}
   6070 
// VEX and legacy encodings of pextrb.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
   6075 
   6076 
   6077 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
// PEXTRW (SSE4.1 encoding): only the memory-destination form carries a
// pattern; the register form is isCodeGenOnly/ForceDisassemble, i.e. kept
// for the disassembler rather than for instruction selection.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, Sched<[WriteShuffle]>;

  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
                                                  imm:$src2)))), addr:$dst)]>;
}
   6095 
// VEX and legacy encodings of the SSE4.1 pextrw.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
   6100 
   6101 
   6102 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
// PEXTRD: extract dword $src2 of a v4i32 into a GR32 or to memory, matched
// directly from generic extractelt.
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>;
}
   6119 
// VEX and legacy encodings of pextrd.
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
   6124 
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
// PEXTRQ: 64-bit sibling of SS41I_extract32; both forms need REX.W.
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>, REX_W;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, REX_W;
}
   6142 
// VEX and legacy encodings of pextrq.
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
   6147 
   6148 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
   6149 /// destination
// EXTRACTPS: the extracted lane is matched as an i32 via bc_v4i32, so both
// the GPR and memory destinations use integer extractelt patterns.
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                    itins.rr>, Sched<[WriteFBlend]>;
  let SchedRW = [WriteFBlendLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)], itins.rm>;
}
   6167 
// VEX and legacy encodings of extractps, pinned to the FP-single domain.
let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
   6173 
// The mr instructions above match i32 stores; these two patterns also catch
// the same extract when the store is typed f32 (via bitconvert).
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
   6185 
   6186 //===----------------------------------------------------------------------===//
   6187 // SSE4.1 - Insert Instructions
   6188 //===----------------------------------------------------------------------===//
   6189 
// PINSRB: insert a byte from a GPR (rr) or from an i8 load (rm) into lane
// $src3 of $src1.  Is2Addr selects the SSE vs. AVX asm operand order.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
   6210 
// VEX and legacy (tied-operand) encodings of pinsrb.
let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
   6215 
// PINSRD: as SS41I_insert8 but dword-sized, matched from generic insertelt.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
   6236 
// VEX and legacy (tied-operand) encodings of pinsrd.
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
   6241 
// PINSRQ: 64-bit sibling of SS41I_insert32 (REX.W/VEX.W added at the defm).
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
   6262 
// VEX.W and legacy REX.W encodings of pinsrq.
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
   6267 
   6268 // insertps has a few different modes, there's the first two here below which
   6269 // are optimized inserts that won't zero arbitrary elements in the destination
   6270 // vector. The next one matches the intrinsic and could zero arbitrary elements
   6271 // in the target vector.
// INSERTPS: matched via the X86insertps node; the rm form folds a scalar
// f32 load wrapped in scalar_to_vector.  (See the comment block above this
// multiclass for the instruction's insert/zero modes.)
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
      Sched<[WriteFShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))], itins.rm>,
      Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
   6295 
let ExeDomain = SSEPackedSingle in {
  // VEX form: non-destructive three-operand encoding.
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  // SSE form: destructive two-operand encoding.
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
   6302 
let Predicates = [UseSSE41] in {
  // If we're inserting an element from a load or a null pshuf of a load,
  // fold the load into the insertps instruction. (A pshufd with immediate 0
  // broadcasts/copies element 0, so insertps can read it from memory
  // directly.)
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
                   imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
   6314 
let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction. (insertps reads a single f32
  // from memory, which is exactly the broadcast source element.)
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
   6325 
   6326 //===----------------------------------------------------------------------===//
   6327 // SSE4.1 - Round Instructions
   6328 //===----------------------------------------------------------------------===//
   6329 
   6330 multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
   6331                             X86MemOperand x86memop, RegisterClass RC,
   6332                             PatFrag mem_frag32, PatFrag mem_frag64,
   6333                             Intrinsic V4F32Int, Intrinsic V2F64Int> {
   6334 let ExeDomain = SSEPackedSingle in {
   6335   // Intrinsic operation, reg.
   6336   // Vector intrinsic operation, reg
   6337   def PSr : SS4AIi8<opcps, MRMSrcReg,
   6338                     (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
   6339                     !strconcat(OpcodeStr,
   6340                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   6341                     [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
   6342                     IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
   6343 
   6344   // Vector intrinsic operation, mem
   6345   def PSm : SS4AIi8<opcps, MRMSrcMem,
   6346                     (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
   6347                     !strconcat(OpcodeStr,
   6348                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   6349                     [(set RC:$dst,
   6350                           (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
   6351                           IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
   6352 } // ExeDomain = SSEPackedSingle
   6353 
   6354 let ExeDomain = SSEPackedDouble in {
   6355   // Vector intrinsic operation, reg
   6356   def PDr : SS4AIi8<opcpd, MRMSrcReg,
   6357                     (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
   6358                     !strconcat(OpcodeStr,
   6359                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   6360                     [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
   6361                     IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
   6362 
   6363   // Vector intrinsic operation, mem
   6364   def PDm : SS4AIi8<opcpd, MRMSrcMem,
   6365                     (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
   6366                     !strconcat(OpcodeStr,
   6367                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   6368                     [(set RC:$dst,
   6369                           (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
   6370                           IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
   6371 } // ExeDomain = SSEPackedDouble
   6372 }
   6373 
// sse41_fp_binop_rm - Scalar FP round: [V]ROUNDSS/[V]ROUNDSD. Rounds the
// low element of $src2 per the 8-bit immediate, passing the upper elements
// of $src1 through. Emits plain FR32/FR64 forms (pattern-less; selected via
// the Pat<> rules below) plus VR128 intrinsic forms.
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg. (No pattern; hasSideEffects = 0 keeps it schedulable.)
  let hasSideEffects = 0 in
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem. (sse_load_f32 folds a scalar load.)
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;

  // Operation, reg. Double-precision counterpart of SSr.
  let hasSideEffects = 0 in
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem. (sse_load_f64 folds a scalar load.)
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
}
   6450 
// FP round - roundss, roundps, roundsd, roundpd
// Note: the packed (unop) and scalar (binop) multiclasses emit records with
// disjoint suffixes (PSr/PDr vs. SSr/SDr), so both defms can share the
// VROUND name without collision.
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
}
   6466 
// Lower scalar rounding ISD nodes to VROUNDSS/VROUNDSD with the matching
// rounding-control immediate (bits 1:0 = mode, bit 2 = use MXCSR mode,
// bit 3 = suppress precision/inexact exception):
//   0x9 floor, 0xA ceil, 0xB trunc, 0xC nearbyint (current mode, no
//   inexact), 0x4 rint (current mode, inexact may fire).
// The IMPLICIT_DEF first operand supplies the don't-care upper elements.
let Predicates = [UseAVX] in {
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
   6489 
// Packed rounding nodes -> VROUNDPS/VROUNDPD (128- and 256-bit). Same
// immediate encoding as the scalar patterns above.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xB))>;

  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x9))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xA))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xB))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x9))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xA))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xB))>;
}
   6535 
// SSE4.1 (non-VEX) ROUNDPS/ROUNDPD; memory operands must be aligned, hence
// memopv* fragments instead of the loadv* ones used by the AVX forms.
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
// ROUNDSS/ROUNDSD: destructive two-operand encoding.
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
   6542 
// SSE4.1 counterparts of the AVX rounding patterns above; identical
// rounding-control immediates (0x9 floor, 0xA ceil, 0xB trunc, 0xC
// nearbyint, 0x4 rint).
let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;

  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xB))>;
}
   6587 
   6588 //===----------------------------------------------------------------------===//
   6589 // SSE4.1 - Packed Bit Test
   6590 //===----------------------------------------------------------------------===//
   6591 
// PTEST: X86ISelLowering lowers to this node, primarily from the Intel
// intrinsic that corresponds to it.
// VPTEST: ANDs/ANDNs the operands and sets ZF/CF in EFLAGS; no result
// register, so (outs) is empty and EFLAGS is an explicit def.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
// NOTE(review): memory operand is declared f128mem although the pattern is
// integer (loadv2i64) — verify this is intentional operand-class reuse.
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}
   6613 
// Non-VEX PTEST: memory form requires alignment (memopv2i64).
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
   6624 
// The bit test instructions below are AVX only.
// avx_bittest - VTESTPS/VTESTPD: like VPTEST but testing only the sign bits
// of each packed FP element (the X86testp node); sets ZF/CF in EFLAGS.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}
   6637 
// 128- and 256-bit VTESTPS/VTESTPD instantiations, split by ExeDomain.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
   6650 
   6651 //===----------------------------------------------------------------------===//
   6652 // SSE4.1 - Misc Instructions
   6653 //===----------------------------------------------------------------------===//
   6654 
// POPCNT (16/32/64-bit, reg and mem forms): population count (ctpop);
// also writes EFLAGS (implicit EFLAGS in each pattern).
// NOTE(review): scheduled as WriteFAdd — there appears to be no dedicated
// popcnt sched class in this file; confirm against the scheduling model.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                       Sched<[WriteFAddLd]>, XS;
}
   6689 
   6690 
   6691 
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16
// (sixteen-bit elements). Reg and load-folding mem forms, scheduled via the
// supplied X86FoldableSchedWrite.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  // Memory form: ld_frag produces v2i64, bitconverted to the intrinsic type.
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128 (bitconvert (ld_frag addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}
   6708 
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model (WriteVecIMul), although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw, loadv2i64,
                                         WriteVecIMul>, VEX;
// SSE form: aligned loads only (memopv2i64).
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw, memopv2i64,
                                         WriteVecIMul>;
   6718 
/// SS48I_binop_rm - Simple SSE41 binary operator: reg/reg plus a
/// load-folding reg/mem form. The memory operand is loaded with memop_frag
/// (which yields an integer vector) and bitconverted to OpVT.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  // Register form is commutable; the rm form is not (memory is always src2).
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   6741 
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types (e.g. pmuldq: v4i32 sources -> v2i64 result).
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  // Load-folding form: memory operand bitconverted to the source type.
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   6766 
// AVX 128-bit dword min/max and pmuldq. Gated on NoVLX so the EVEX
// versions take priority when AVX512VL is available.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                   VR128, loadv2i64, i128mem,
                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
// AVX 128-bit byte/word min/max. Disabled when both AVX512VL and AVX512BW
// are available (the EVEX versions take over).
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
}
   6798 
// AVX2 256-bit dword min/max and pmuldq (Y-suffixed), VEX.L encoded.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
// AVX2 256-bit byte/word min/max (Y-suffixed), VEX.L encoded.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}
   6830 
// SSE4.1 min/max and pmuldq: destructive two-operand forms with aligned
// memory operands (memopv2i64).
let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTMUL_ITINS_P, 1>;
}
   6852 
   6853 let Predicates = [HasAVX, NoVLX] in {
   6854   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
   6855                                  memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
   6856                                  VEX_4V;
   6857   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
   6858                                  memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
   6859                                  VEX_4V;
   6860 }
// AVX2 256-bit forms of PMULLD/PCMPEQQ.
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}

// Legacy SSE4.1 encodings (two-operand, tied destination).
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  // NOTE(review): PCMPEQQ uses SSE_INTALUQ_ITINS_P here while the VEX forms
  // above use SSE_INTALU_ITINS_P -- confirm the asymmetry is intentional.
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
   6876 
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate.
///
/// Emits a register-register form (rri) and a register-memory form (rmi)
/// that lower directly to the given intrinsic. The !if(Is2Addr, ...)
/// selects between the two-operand legacy asm string and the
/// three-operand AVX-style asm string.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  // Register form is marked commutable (callers clear this when the
  // operation is not, e.g. MPSADBW below).
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  // Memory form: the second source is loaded via memop_frag and
  // bitconverted to the intrinsic's expected vector type.
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   6904 
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate.
///
/// Same shape as SS41I_binop_rmi_int, but the patterns lower to a target
/// SDNode (OpNode) producing OpVT instead of an intrinsic call.
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  // Register form is marked commutable by default.
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
        itins.rr>, Sched<[itins.Sched]>;
  // Memory form folds the second source from memory.
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
   6932 
// AVX/AVX2 immediate-controlled blends, MPSADBW and dot-product ops
// (three-operand VEX encodings, Is2Addr = 0).
let Predicates = [HasAVX] in {
  // MPSADBW is not commutable: swapping sources changes the result.
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, loadv2i64, i128mem, 0,
                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
  }

  let ExeDomain = SSEPackedSingle in {
  defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, loadv4f32, f128mem, 0,
                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
  defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, loadv8f32, f256mem, 0,
                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
  defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, loadv2f64, f128mem, 0,
                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
  defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, loadv4f64, f256mem, 0,
                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, loadv2i64, i128mem, 0,
                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;

  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  // NOTE(review): VDPPSY uses i256mem while the sibling FP ops above use
  // f128mem/f256mem -- confirm the operand class is intended.
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, i256mem, 0,
                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
  }
  defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, loadv4i64, i256mem, 0,
                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
}
   6984 
// Legacy SSE4.1 encodings of the same blend/MPSADBW/dot-product ops
// (two-operand, tied destination).
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_MPSADBW_ITINS>;
  }
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
                                 VR128, memopv4f32, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                                 VR128, memopv2f64, f128mem,
                                 1, SSE_INTALU_ITINS_FBLEND_P>;
  defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                 VR128, memopv2i64, i128mem,
                                 1, SSE_INTALU_ITINS_BLEND_P>;
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}
   7011 
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands.
///
/// For the variable-blend family (BLENDV*): dst, two sources, plus a third
/// register source acting as the selector. The third register is encoded
/// in the imm8 field (VEX_I8IMM). No itinerary is attached (NoItinerary);
/// scheduling comes solely from the Sched parameter.
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite Sched> {
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched]>;

  // Memory form folds the second source from memory.
  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched.Folded, ReadAfterLd]>;
}
   7035 
// AVX/AVX2 variable blends: selector lives in an explicit register
// (vs. the legacy forms below, which implicitly use XMM0).
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd,
                                           WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  loadv4f64, int_x86_avx_blendv_pd_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps,
                                           WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  loadv8f32, int_x86_avx_blendv_ps_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb,
                                           WriteVarBlend>;
}

// 256-bit byte blend requires AVX2.
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                      loadv4i64, int_x86_avx2_pblendvb,
                                      WriteVarBlend>, VEX_L;
}
   7063 
// Lower vselect to the AVX variable blends. Note the operand swap:
// vselect picks its first value operand where the mask is true, while
// blendv picks its *second* source where the mask is set, so $src1 and
// $src2 are exchanged in the output instruction.
let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// 256-bit byte-granular vselect needs AVX2's VPBLENDVBY.
let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
   7099 
// Patterns
// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  // Blend immediates select which lanes come from $src: 1 = low f32 lane,
  // 3 = the two low i16 lanes (the low i32) for VPBLENDW.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
  }

  // Scalar zero-extension into a 256-bit register: build the zeroed xmm
  // move, then place it in the low 128 bits via SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;

  // These will incur an FP/int domain crossing penalty, but it may be the only
  // way without AVX2. Do not add any complexity because we may be able to match
  // more optimal patterns defined earlier in this file.
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
}

// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41], AddedComplexity = 15 in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
}
   7158 
   7159 
/// SS41I_ternary_int - SSE 4.1 ternary operator.
///
/// Legacy BLENDV* family: the selector is the implicit XMM0 register
/// (Uses = [XMM0]), and the first source is tied to the destination.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>, Sched<[itins.Sched]>;

    // Memory form folds the second source from memory.
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
   7182 
// Legacy SSE4.1 variable blends (implicit XMM0 selector).
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd,
                                  DEFAULT_ITINS_FBLENDSCHED>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps,
                                  DEFAULT_ITINS_FBLENDSCHED>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb,
                                  DEFAULT_ITINS_VARBLENDSCHED>;

// Aliases with the implicit xmm0 argument, so the assembler also accepts
// the three-operand spelling that names xmm0 explicitly.
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
   7208 
// Lower vselect to the legacy blends when the mask is already in XMM0.
// As with the AVX patterns above, $src1/$src2 are swapped because blendv
// selects its second source where the mask is set.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
   7226 
// MOVNTDQA: non-temporal aligned vector loads. High AddedComplexity makes
// these win over the ordinary load patterns for nontemporal loads.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       VEX;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW

// Route aligned nontemporal loads of any element type through the single
// integer MOVNTDQA form (there is no FP-domain nontemporal load).
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}

} // AddedComplexity
   7272 
   7273 //===----------------------------------------------------------------------===//
   7274 // SSE4.2 - Compare Instructions
   7275 //===----------------------------------------------------------------------===//
   7276 
/// SS42I_binop_rm - Simple SSE 4.2 binary operator.
///
/// Register-register and register-memory forms lowering to an SDNode.
/// NOTE(review): unlike the SSE4.1 multiclasses above, no Sched<...> or
/// itinerary is attached here -- confirm whether that is intentional.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
  // Memory form folds the second source from memory.
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
}
   7295 
// PCMPGTQ: signed 64-bit element compare (SSE4.2). VEX forms use the
// unaligned load fragment; the legacy form uses the aligned memop.
let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
   7307 
   7308 //===----------------------------------------------------------------------===//
   7309 // SSE4.2 - String/text Processing Instructions
   7310 //===----------------------------------------------------------------------===//
   7311 
// Packed Compare Implicit Length Strings, Return Mask.
//
// Two layers: pseudos carrying the selection patterns (expanded by a
// custom inserter because the result is pinned to XMM0 and EFLAGS), and
// the real encodings below with no patterns of their own.
multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
                         Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
                         Requires<[UseSSE42]>;
}

// Real encodings: pattern-less, selected only via the pseudos above.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm :SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
   7348 
// Packed Compare Explicit Length Strings, Return Mask.
//
// Explicit-length variant: string lengths come in via the implicit
// EAX/EDX registers, threaded through the pseudo's pattern.
multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
                         Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
                         Requires<[UseSSE42]>;
}

// Real encodings: pattern-less, selected only via the pseudos above.
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
}
   7385 
// Packed Compare Implicit Length Strings, Return Index.
//
// Same pseudo/real split as pcmpistrm, but the result is an index in a
// GR32 (pinned to ECX on the real encodings) plus EFLAGS.
multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
                      Requires<[HasAVX]>;
  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
                      Requires<[UseSSE42]>;
}

// Real encodings: pattern-less, selected only via the pseudos above.
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
   7422 
   7423 // Packed Compare Explicit Length Strings, Return Index
// Pseudo-instructions carrying the X86pcmpestri patterns. Like the implicit
// variants above, but the explicit string lengths in EAX/EDX appear as DAG
// operands; expansion happens in a custom inserter.
multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
  // Register-register form.
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  // Register-memory form; ld_frag distinguishes AVX from SSE loads.
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
       imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
                      Requires<[HasAVX]>;
  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
                      Requires<[UseSSE42]>;
}
   7442 
// The real PCMPESTRI instructions, opcode 0x61. Patterns live on the pseudos
// above; the index lands in ECX and the lengths come from EAX/EDX, all
// modeled implicitly on the instantiation below.
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
   7460 
   7461 //===----------------------------------------------------------------------===//
   7462 // SSE4.2 - CRC Instructions
   7463 //===----------------------------------------------------------------------===//
   7464 
   7465 // No CRC instructions have AVX equivalents
   7466 
   7467 // crc intrinsic instruction
   7468 // This set of instructions are only rm, the only difference is the size
   7469 // of r and m.
// Register form of CRC32: dst = Int(dst, src2), with dst tied to src1 by the
// Constraints on the defs below. RCOut/RCIn may differ in width (e.g. GR32
// accumulator with a GR8 input).
// NOTE(review): scheduled under WriteFAdd/WriteFAddLd — there is no dedicated
// CRC scheduling class in this model, so the FP-add class is borrowed.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
         Sched<[WriteFAdd]>;

// Memory form of CRC32: same contract with the second operand loaded from
// memory via the plain `load` fragment.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
   7483 
// CRC32 accumulates into its first operand, hence the tied-operand
// constraint. Opcode 0xF0 handles the byte input; 0xF1 handles 16/32/64-bit
// inputs, disambiguated by OpSize16/OpSize32/REX_W prefixes.
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit-accumulator byte forms have no intrinsic (null_frag), so they
  // exist only for assembly/disassembly and carry no patterns.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
   7509 
   7510 //===----------------------------------------------------------------------===//
   7511 // SHA-NI Instructions
   7512 //===----------------------------------------------------------------------===//
   7513 
// Two-operand SHA-NI instructions (T8 encoding). When UsesXMM0 is set the
// intrinsic takes XMM0 as a third, implicit operand (used by sha256rnds2);
// the !if selects between the two pattern shapes at multiclass-expansion
// time.
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  // Register-register form.
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  // Register-memory form; the 128-bit memory operand is loaded with SSE
  // (memop) semantics and bitcast to v4i32 for the intrinsic.
  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}
   7532 
// SHA-NI instruction definitions. All are destructive (first source tied to
// the destination). SHA1RNDS4 is the only one with an immediate operand and
// uses the TA encoding; the rest go through SHAI_binop (T8).
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  // sha256rnds2 reads a third source implicitly from XMM0.
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}
   7558 
   7559 // Aliases with explicit %xmm0
// Accept assembly that spells the implicit XMM0 operand of sha256rnds2
// explicitly; both aliases map to the two-operand instruction forms.
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
   7564 
   7565 //===----------------------------------------------------------------------===//
   7566 // AES-NI Instructions
   7567 //===----------------------------------------------------------------------===//
   7568 
// AES round instructions (aesenc/aesdec and the *last variants). Is2Addr
// selects between the destructive SSE assembly string (two operands) and the
// non-destructive VEX three-operand string; ld_frag selects AVX vs SSE load
// semantics for the memory form.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}
   7587 
   7588 // Perform One Round of an AES Encryption/Decryption Flow
// VEX-encoded (non-destructive, Is2Addr = 0) AES round instructions.
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
}

// Legacy SSE encodings: destructive, first source tied to the destination.
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc, memopv2i64>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast, memopv2i64>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec, memopv2i64>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast, memopv2i64>;
}
   7610 
   7611 // Perform the AES InvMixColumn Transformation
// AESIMC (opcode 0xDB): unary InvMixColumns transform. VEX forms first,
// then the legacy SSE encodings.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
// NOTE(review): these legacy defs carry no HasAES predicate here; selection
// presumably relies on the intrinsic only being emitted when AES is
// available — confirm against the rest of the backend.
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMCLd]>;
   7635 
   7636 // AES Round Key Generation Assist
// AESKEYGENASSIST (opcode 0xDF): key-expansion helper taking an 8-bit round
// constant immediate. VEX forms first, then the legacy SSE encodings.
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGenLd]>;
   7663 
   7664 //===----------------------------------------------------------------------===//
   7665 // PCLMUL Instructions
   7666 //===----------------------------------------------------------------------===//
   7667 
   7668 // AVX carry-less Multiplication instructions
// VEX-encoded carry-less multiply (opcode 0x44). The immediate selects
// which 64-bit halves of each source are multiplied. The rr form is marked
// commutable: swapping sources is legal when the immediate is adjusted,
// which the backend handles.
let isCommutable = 1 in
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
   7683 
   7684 // Carry-less Multiplication instructions
// Legacy SSE carry-less multiply: destructive (first source tied to the
// destination), otherwise parallel to the VEX forms above.
let Constraints = "$src1 = $dst" in {
let isCommutable = 1 in
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
                              IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
   7702 
   7703 
// Assembly aliases for the four named half-selector spellings of pclmulqdq
// (e.g. "pclmulhqhqdq" == pclmulqdq with imm 0x11). The trailing 0 marks the
// alias as not preferred for disassembly output.
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
                  0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
                  0>;
}
// Immediate bit 0 selects the low/high half of src1; bit 4 selects for src2.
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
   7725 
   7726 //===----------------------------------------------------------------------===//
   7727 // SSE4A Instructions
   7728 //===----------------------------------------------------------------------===//
   7729 
let Predicates = [HasSSE4A] in {

// EXTRQ/INSERTQ operate in place on their first vector operand. The
// immediate forms (EXTRQI/INSERTQI) take explicit bit length and index;
// the register forms pack that control data into a second XMM operand.
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                      imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let mayStore = 1, SchedRW = [WriteStore] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
} // SchedRW

// Scalar FP values live in the low element of an XMM register, so the
// FR32/FR64 sources are re-classed to VR128 before the store.
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;

} // AddedComplexity
} // HasSSE4A
   7774 
   7775 //===----------------------------------------------------------------------===//
   7776 // AVX Instructions
   7777 //===----------------------------------------------------------------------===//
   7778 
   7779 //===----------------------------------------------------------------------===//
   7780 // VBROADCAST - Load from memory and broadcast to all elements of the
   7781 //              destination operand
   7782 //
// Memory-source broadcast: load one scalar via ld_frag and replicate it into
// every element of the destination (pattern uses X86VBroadcast).
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX;

// AVX2 adds register forms: broadcast from the low element of an XMM source.
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;
   7798 
// vbroadcastss/sd from memory: AVX1 (memory-source only). The NoVLX guard
// keeps these from clashing with the EVEX forms.
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                             f32mem, v4f32, loadf32, WriteLoad>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                             f32mem, v8f32, loadf32,
                                             WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                    v4f64, loadf64, WriteFShuffleLd>, VEX_L;

// Register-source broadcast forms, available from AVX2 on.
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, WriteFShuffle>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;

// 128-bit whole-lane broadcasts. VBROADCASTI128 has no pattern (selected
// manually elsewhere); VBROADCASTF128 is matched via its pd_256 intrinsic,
// and the ps_256 intrinsic is folded onto the same instruction below.
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteLoad]>, VEX, VEX_L;

def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst,
                              (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
                           Sched<[WriteFShuffleLd]>, VEX, VEX_L;

let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;
   7836 
   7837 
   7838 //===----------------------------------------------------------------------===//
   7839 // VINSERTF128 - Insert packed floating-point values
   7840 //
// VINSERTF128 (opcode 0x18): insert a 128-bit value into a 256-bit register
// at the lane selected by the immediate. The instructions carry no patterns;
// matching is done through the vinsert128_insert patterns below.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
   7852 
// Floating-point 128-bit insert patterns. INSERT_get_vinsert128_imm converts
// the element index captured by vinsert128_insert into the lane immediate.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Fold a loaded 128-bit source into the memory form.
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
   7872 
// Integer 128-bit inserts on AVX1-only targets: no VINSERTI128 exists, so
// all integer element types are lowered onto the FP VINSERTF128.
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Memory forms: integer loads are expressed as loadv2i64 plus a bitcast to
// the element type being inserted.
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
   7911 
   7912 //===----------------------------------------------------------------------===//
   7913 // VEXTRACTF128 - Extract packed floating-point values
   7914 //
// VEXTRACTF128 (opcode 0x19): extract the 128-bit lane selected by the
// immediate, to a register or directly to memory. No patterns on the
// instructions themselves; see the vextract128_extract patterns below.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteStore]>, VEX, VEX_L;
}
   7926 
// AVX1 patterns
// Floating-point lane extraction, register and store-to-memory forms.
// EXTRACT_get_vextract128_imm converts the captured element index into the
// lane immediate.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
   7947 
// Integer lane extraction on AVX1-only targets: no VEXTRACTI128 exists, so
// all integer element types are lowered onto the FP VEXTRACTF128.
let Predicates = [HasAVX1Only] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Store-to-memory forms for the same four element types.
def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
   7983 
   7984 //===----------------------------------------------------------------------===//
   7985 // VMASKMOV - Conditional SIMD Packed Loads and Stores
   7986 //
// Masked loads (opc_rm) and masked stores (opc_mr) in 128-bit and 256-bit
// ("Y") widths, matched via the per-width intrinsics. Note the intrinsic
// operand order for loads: address first, mask second.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
   8009 
   8010 let ExeDomain = SSEPackedSingle in
   8011 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
   8012                                  int_x86_avx_maskload_ps,
   8013                                  int_x86_avx_maskload_ps_256,
   8014                                  int_x86_avx_maskstore_ps,
   8015                                  int_x86_avx_maskstore_ps_256>;
   8016 let ExeDomain = SSEPackedDouble in
   8017 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
   8018                                  int_x86_avx_maskload_pd,
   8019                                  int_x86_avx_maskload_pd_256,
   8020                                  int_x86_avx_maskstore_pd,
   8021                                  int_x86_avx_maskstore_pd_256>;
   8022 
   8023 //===----------------------------------------------------------------------===//
   8024 // VPERMIL - Permute Single and Double Floating-Point Values
   8025 //
        // Emits the four VPERMILPS/PD forms for one register class:
        //   rr/rm - variable permute (X86VPermilpv), control vector in
        //           $src2 (register or memory; memory control is loaded as
        //           an integer vector and bitconverted).
        //   ri/mi - immediate permute (X86VPermilpi), 8-bit control imm;
        //           the mi form folds a load of the *data* operand instead.
        // Guarded by NoVLX so the AVX-512VL forms win when available.
   8026 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
   8027                       RegisterClass RC, X86MemOperand x86memop_f,
   8028                       X86MemOperand x86memop_i, PatFrag i_frag,
   8029                       ValueType f_vt, ValueType i_vt> {
   8030   let Predicates = [HasAVX, NoVLX] in {
   8031     def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
   8032                (ins RC:$src1, RC:$src2),
   8033                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8034                [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
   8035                Sched<[WriteFShuffle]>;
   8036     def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
   8037                (ins RC:$src1, x86memop_i:$src2),
   8038                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8039                [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
   8040                               (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
   8041                Sched<[WriteFShuffleLd, ReadAfterLd]>;
   8042 
   8043     def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
   8044              (ins RC:$src1, u8imm:$src2),
   8045              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8046              [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
   8047              Sched<[WriteFShuffle]>;
   8048     def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
   8049              (ins x86memop_f:$src1, u8imm:$src2),
   8050              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8051              [(set RC:$dst,
   8052                (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
   8053              Sched<[WriteFShuffleLd]>;
   8054   }// Predicates = [HasAVX, NoVLX]
   8055 }
   8056 
        // 128-bit and 256-bit (Y) instantiations for PS and PD.
   8057 let ExeDomain = SSEPackedSingle in {
   8058   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
   8059                                loadv2i64, v4f32, v4i32>;
   8060   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
   8061                                loadv4i64, v8f32, v8i32>, VEX_L;
   8062 }
   8063 let ExeDomain = SSEPackedDouble in {
   8064   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
   8065                                loadv2i64, v2f64, v2i64>;
   8066   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
   8067                                loadv4i64, v4f64, v4i64>, VEX_L;
   8068 }
   8069 
        // Extra selection patterns not covered by the multiclass above:
        // - variable permutes whose control vector comes via a bitconverted
        //   integer load, and
        // - *integer-typed* immediate permutes (v8i32/v4i64/v2i64), which
        //   reuse the FP-domain VPERMILP encodings.
   8070 let Predicates = [HasAVX, NoVLX] in {
   8071 def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
   8072           (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
   8073 def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
   8074           (VPERMILPSYrm VR256:$src1, addr:$src2)>;
   8075 def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
   8076           (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
   8077 def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
   8078           (VPERMILPDYrm VR256:$src1, addr:$src2)>;
   8079 
   8080 def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
   8081           (VPERMILPSYri VR256:$src1, imm:$imm)>;
   8082 def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
   8083           (VPERMILPDYri VR256:$src1, imm:$imm)>;
   8084 def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
   8085                                (i8 imm:$imm))),
   8086           (VPERMILPSYmi addr:$src1, imm:$imm)>;
   8087 def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
   8088           (VPERMILPDYmi addr:$src1, imm:$imm)>;
   8089 
        // 128-bit equivalents of the patterns above.
   8090 def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
   8091           (VPERMILPSrr VR128:$src1, VR128:$src2)>;
   8092 def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
   8093           (VPERMILPSrm VR128:$src1, addr:$src2)>;
   8094 def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
   8095           (VPERMILPDrr VR128:$src1, VR128:$src2)>;
   8096 def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
   8097           (VPERMILPDrm VR128:$src1, addr:$src2)>;
   8098 
   8099 def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
   8100           (VPERMILPDri VR128:$src1, imm:$imm)>;
   8101 def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
   8102           (VPERMILPDmi addr:$src1, imm:$imm)>;
   8103 }
   8104 
   8105 //===----------------------------------------------------------------------===//
   8106 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
   8107 //
        // VPERM2F128: select two 128-bit lanes (one from each source) per
        // the immediate. Only the v8f32 form carries a pattern on the
        // instruction itself; other element types are mapped by the
        // patterns below.
   8108 let ExeDomain = SSEPackedSingle in {
   8109 let isCommutable = 1 in
   8110 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
   8111           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
   8112           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
   8113           [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
   8114                               (i8 imm:$src3))))]>, VEX_4V, VEX_L,
   8115           Sched<[WriteFShuffle]>;
   8116 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
   8117           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
   8118           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
   8119           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
   8120                              (i8 imm:$src3)))]>, VEX_4V, VEX_L,
   8121           Sched<[WriteFShuffleLd, ReadAfterLd]>;
   8122 }
   8123 
        // v4f64 permutes also use the single-precision-encoded instruction.
   8124 let Predicates = [HasAVX] in {
   8125 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8126           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8127 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
   8128                   (loadv4f64 addr:$src2), (i8 imm:$imm))),
   8129           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8130 }
   8131 
        // Without AVX2 there is no VPERM2I128, so integer 256-bit permutes
        // fall back to the FP-domain encoding (HasAVX1Only keeps these from
        // shadowing the VPERM2I128 patterns on AVX2 targets).
   8132 let Predicates = [HasAVX1Only] in {
   8133 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8134           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8135 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8136           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8137 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8138           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8139 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8140           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8141 
   8142 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
   8143                   (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
   8144           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8145 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
   8146                   (loadv4i64 addr:$src2), (i8 imm:$imm))),
   8147           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8148 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
   8149                   (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
   8150           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8151 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
   8152                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
   8153           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8154 }
   8155 
   8156 //===----------------------------------------------------------------------===//
   8157 // VZERO - Zero YMM registers
   8158 //
        // Both instructions write (parts of) every YMM register, so all 16
        // are listed as implicit defs; neither takes explicit operands.
   8159 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
   8160             YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
   8161   // Zero All YMM registers
   8162   def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
   8163                   [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
   8164 
   8165   // Zero Upper bits of YMM registers
   8166   def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
   8167                      [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
   8168 }
   8169 
   8170 //===----------------------------------------------------------------------===//
   8171 // Half precision conversion instructions
   8172 //===----------------------------------------------------------------------===//
        // VCVTPH2PS: half -> single conversion. The memory form has no
        // pattern (loads are matched by the explicit Pats further down), so
        // it is flagged hasSideEffects = 0 / mayLoad = 1 for the scheduler.
   8173 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
   8174   def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
   8175              "vcvtph2ps\t{$src, $dst|$dst, $src}",
   8176              [(set RC:$dst, (Int VR128:$src))]>,
   8177              T8PD, VEX, Sched<[WriteCvtF2F]>;
   8178   let hasSideEffects = 0, mayLoad = 1 in
   8179   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
   8180              "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
   8181              Sched<[WriteCvtF2FLd]>;
   8182 }
   8183 
        // VCVTPS2PH: single -> half conversion with a rounding-control
        // immediate; the store (mr) form is likewise pattern-less and
        // matched by explicit Pats below.
   8184 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
   8185   def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
   8186                (ins RC:$src1, i32u8imm:$src2),
   8187                "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   8188                [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
   8189                TAPD, VEX, Sched<[WriteCvtF2F]>;
   8190   let hasSideEffects = 0, mayStore = 1,
   8191       SchedRW = [WriteCvtF2FLd, WriteRMW] in
   8192   def mr : Ii8<0x1D, MRMDestMem, (outs),
   8193                (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
   8194                "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
   8195                TAPD, VEX;
   8196 }
   8197 
        // F16C instantiations plus load/store folding for the pattern-less
        // memory forms declared in the multiclasses above.
   8198 let Predicates = [HasF16C] in {
   8199   defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
   8200   defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
   8201   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
   8202   defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
   8203 
   8204   // Pattern match vcvtph2ps of a scalar i64 load.
        // Three forms the DAG can take for "low 64 bits loaded, rest zero".
   8205   def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
   8206             (VCVTPH2PSrm addr:$src)>;
   8207   def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
   8208             (VCVTPH2PSrm addr:$src)>;
   8209   def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
   8210               (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
   8211             (VCVTPH2PSrm addr:$src)>;
   8212 
        // Fold "convert then store the low 64 bits" into the mr forms,
        // whichever scalar type (f64 or i64) the extract produces.
   8213   def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
   8214                   (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
   8215                    addr:$dst),
   8216                    (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
   8217   def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
   8218                   (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
   8219                    addr:$dst),
   8220                    (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
   8221   def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
   8222                    addr:$dst),
   8223                    (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
   8224 }
   8225 
   8226 // Patterns for matching conversions from float to half-float and vice versa.
        // Scalar fp16 conversions go through XMM: move the value into a
        // vector register, convert, and extract the scalar result. The
        // immediate 4 (0b100) on VCVTPS2PHrr selects MXCSR.RC rounding.
   8227 let Predicates = [HasF16C] in {
   8228   // Use MXCSR.RC for rounding instead of explicitly specifying the default
   8229   // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
   8230   // configurations we support (the default). However, falling back to MXCSR is
   8231   // more consistent with other instructions, which are always controlled by it.
   8232   // It's encoded as 0b100.
   8233   def : Pat<(fp_to_f16 FR32:$src),
   8234             (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
   8235               (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
   8236 
   8237   def : Pat<(f16_to_fp GR16:$src),
   8238             (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
   8239               (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
   8240 
        // Round-trip f32 -> f16 -> f32 stays entirely in XMM registers.
   8241   def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
   8242             (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
   8243               (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
   8244 }
   8245 
   8246 //===----------------------------------------------------------------------===//
   8247 // AVX2 Instructions
   8248 //===----------------------------------------------------------------------===//
   8249 
   8250 /// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
   8251 /// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
        /// Emits the reg/reg (rri) and reg/mem (rmi) forms; the memory
        /// operand is loaded via memop_frag and bitconverted to OpVT.
   8252 multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
   8253                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
   8254                           X86MemOperand x86memop> {
   8255   let isCommutable = 1 in
   8256   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
   8257         (ins RC:$src1, RC:$src2, u8imm:$src3),
   8258         !strconcat(OpcodeStr,
   8259             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
   8260         [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
   8261         Sched<[WriteBlend]>, VEX_4V;
   8262   def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
   8263         (ins RC:$src1, x86memop:$src2, u8imm:$src3),
   8264         !strconcat(OpcodeStr,
   8265             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
   8266         [(set RC:$dst,
   8267           (OpVT (OpNode RC:$src1,
   8268            (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
   8269         Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
   8270 }
   8271 
        // VPBLENDD: dword blend under an immediate mask (128/256-bit).
   8272 defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
   8273                                VR128, loadv2i64, i128mem>;
   8274 defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
   8275                                 VR256, loadv4i64, i256mem>, VEX_L;
   8275 
   8276 //===----------------------------------------------------------------------===//
   8277 // VPBROADCAST - Load from memory and broadcast to all elements of the
   8278 //               destination operand
   8279 //
        // Emits the VPBROADCAST forms for one element width:
        //   rr/Yrr - broadcast from the low element of an XMM register to a
        //            128-/256-bit destination.
        //   rm/Yrm - broadcast directly from a scalar memory location.
        // The predicate parameter lets BW-sized variants defer to AVX-512.
   8280 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
   8281                           X86MemOperand x86memop, PatFrag ld_frag,
   8282                           ValueType OpVT128, ValueType OpVT256, Predicate prd> {
   8283   let Predicates = [HasAVX2, prd] in {
   8284     def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
   8285                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   8286                   [(set VR128:$dst,
   8287                    (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
   8288                   Sched<[WriteShuffle]>, VEX;
   8289     def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
   8290                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   8291                   [(set VR128:$dst,
   8292                    (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
   8293                   Sched<[WriteLoad]>, VEX;
   8294     def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
   8295                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   8296                    [(set VR256:$dst,
   8297                     (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
   8298                    Sched<[WriteShuffle256]>, VEX, VEX_L;
   8299     def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
   8300                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   8301                    [(set VR256:$dst,
   8302                     (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
   8303                    Sched<[WriteLoad]>, VEX, VEX_L;
   8304 
   8305     // Provide aliases for broadcast from the same register class that
   8306     // automatically does the extract.
   8307     def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
   8308               (!cast<Instruction>(NAME#"Yrr")
   8309                   (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
   8310   }
   8311 }
   8312 
        // Byte/word variants defer to AVX-512BW+VL; dword/qword to AVX-512VL.
   8313 defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
   8314                                     v16i8, v32i8, NoVLX_Or_NoBWI>;
   8315 defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
   8316                                     v8i16, v16i16, NoVLX_Or_NoBWI>;
   8317 defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
   8318                                     v4i32, v8i32, NoVLX>;
   8319 defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
   8320                                     v2i64, v4i64, NoVLX>;
   8321 
   8322 let Predicates = [HasAVX2] in {
   8323   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
   8324   // This means we'll encounter truncated i32 loads; match that here.
   8325   def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
   8326             (VPBROADCASTWrm addr:$src)>;
   8327   def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
   8328             (VPBROADCASTWYrm addr:$src)>;
   8329   def : Pat<(v8i16 (X86VBroadcast
   8330               (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
   8331             (VPBROADCASTWrm addr:$src)>;
   8332   def : Pat<(v16i16 (X86VBroadcast
   8333               (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
   8334             (VPBROADCASTWYrm addr:$src)>;
   8335 
   8336   // Provide aliases for broadcast from the same register class that
   8337   // automatically does the extract.
   8338   def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
   8339             (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
   8340                                                     sub_xmm)))>;
   8341   def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
   8342             (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
   8343                                                     sub_xmm)))>;
   8344 }
   8345 
        // Register-source FP broadcasts: copy the scalar into an XMM
        // register and use the AVX2 register form of VBROADCASTSS/SD.
   8346 let Predicates = [HasAVX2, NoVLX] in {
   8347   // Provide fallback in case the load node that is used in the patterns above
   8348   // is used by additional users, which prevents the pattern selection.
   8349     let AddedComplexity = 20 in {
   8350     def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
   8351               (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
   8352     def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
   8353               (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
   8354     def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
   8355               (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   8356     }
   8357 }
   8358 
        // GPR-source byte/word broadcasts: widen the GPR to 32 bits with
        // SUBREG_TO_REG, move it into an XMM register, then VPBROADCASTB/W.
   8359 let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
   8360   def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
   8361         (VPBROADCASTBrr (COPY_TO_REGCLASS
   8362                          (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
   8363                          VR128))>;
   8364   def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
   8365         (VPBROADCASTBYrr (COPY_TO_REGCLASS
   8366                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
   8367                           VR128))>;
   8368 
   8369   def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
   8370         (VPBROADCASTWrr (COPY_TO_REGCLASS
   8371                          (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
   8372                          VR128))>;
   8373   def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
   8374         (VPBROADCASTWYrr (COPY_TO_REGCLASS
   8375                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
   8376                           VR128))>;
   8377 }
        // GPR-source dword/qword broadcasts.
   8378 let Predicates = [HasAVX2, NoVLX], AddedComplexity = 20 in {
   8379   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
   8380             (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
   8381   def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
   8382             (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
   8383   def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
   8384             (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
   8385 
   8386   // The patterns for VPBROADCASTD are not needed because they would match
   8387   // the exact same thing as VBROADCASTSS patterns.
   8388 
   8389   def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
   8390         (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
   8391   // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
   8392 }
   8393 
   8394 // AVX1 broadcast patterns
        // Without AVX2, integer broadcasts from memory reuse the FP
        // VBROADCASTSS/SD load forms (no register-source forms exist).
   8395 let Predicates = [HasAVX1Only] in {
   8396 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
   8397           (VBROADCASTSSYrm addr:$src)>;
   8398 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
   8399           (VBROADCASTSDYrm addr:$src)>;
   8400 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
   8401           (VBROADCASTSSrm addr:$src)>;
   8402 }
   8403 
   8404   // Provide fallback for register-source broadcasts on targets where the
   8405   // memory-form patterns above cannot be used (the value is in a register).
   8406 let Predicates = [HasAVX], AddedComplexity = 20 in {
   8407   // 128bit broadcasts:
   8408   def : Pat<(v2f64 (X86VBroadcast f64:$src)),
   8409             (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   8410 }
   8411 
        // AVX1-only register-source broadcasts are synthesized from shuffles:
        // VPSHUFD replicates the scalar within an XMM register (imm 0 for
        // 32-bit elements, 0x44 for 64-bit), then VINSERTF128 duplicates
        // that lane into the high half for 256-bit results.
   8412 let Predicates = [HasAVX, NoVLX], AddedComplexity = 20 in {
   8413   def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
   8414             (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
   8415   def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
   8416             (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
   8417               (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
   8418               (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
   8419   def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
   8420             (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
   8421               (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
   8422               (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
   8423 
   8424   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
   8425             (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
   8426   def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
   8427             (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
   8428               (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
   8429               (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
   8430   def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
   8431             (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
   8432               (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
   8433               (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
   8434 
   8435   def : Pat<(v2i64 (X86VBroadcast i64:$src)),
   8436               (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
   8437 }
   8438 
   8439 //===----------------------------------------------------------------------===//
   8440 // VPERM - Permute instructions
   8441 //
   8442 
        // Variable cross-lane permute (VPERMD/VPERMPS): index vector in
        // $src2 selects elements of $src1. 256-bit only; NoVLX defers to
        // the AVX-512VL encodings.
   8443 multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
   8444                      ValueType OpVT, X86FoldableSchedWrite Sched> {
   8445   let Predicates = [HasAVX2, NoVLX] in {
   8446     def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
   8447                      (ins VR256:$src1, VR256:$src2),
   8448                      !strconcat(OpcodeStr,
   8449                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8450                      [(set VR256:$dst,
   8451                        (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
   8452                      Sched<[Sched]>, VEX_4V, VEX_L;
   8453     def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
   8454                      (ins VR256:$src1, i256mem:$src2),
   8455                      !strconcat(OpcodeStr,
   8456                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8457                      [(set VR256:$dst,
   8458                        (OpVT (X86VPermv VR256:$src1,
   8459                               (bitconvert (mem_frag addr:$src2)))))]>,
   8460                      Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
   8461   }
   8462 }
   8463 
   8464 defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
   8465 let ExeDomain = SSEPackedSingle in
   8466 defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
   8467 
        // Immediate cross-lane permute (VPERMQ/VPERMPD): the 8-bit control
        // immediate selects qword elements; the Ymi form folds a load of
        // the data operand.
   8468 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
   8469                          ValueType OpVT, X86FoldableSchedWrite Sched> {
   8470   let Predicates = [HasAVX2, NoVLX] in {
   8471     def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
   8472                        (ins VR256:$src1, u8imm:$src2),
   8473                        !strconcat(OpcodeStr,
   8474                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8475                        [(set VR256:$dst,
   8476                          (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
   8477                        Sched<[Sched]>, VEX, VEX_L;
   8478     def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
   8479                        (ins i256mem:$src1, u8imm:$src2),
   8480                        !strconcat(OpcodeStr,
   8481                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8482                        [(set VR256:$dst,
   8483                          (OpVT (X86VPermi (mem_frag addr:$src1),
   8484                                 (i8 imm:$src2))))]>,
   8485                        Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
   8486   }
   8487 }
   8488 
   8489 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
   8490                             WriteShuffle256>, VEX_W;
   8491 let ExeDomain = SSEPackedDouble in
   8492 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
   8493                              WriteFShuffle256>, VEX_W;
   8494 
   8495 //===----------------------------------------------------------------------===//
   8496 // VPERM2I128 - Permute Integer Values in 128-bit chunks
   8497 //
        // VPERM2I128: integer-domain counterpart of VPERM2F128. Only the
        // v4i64 form is on the instruction itself; other integer element
        // types are mapped by the patterns below.
   8498 let isCommutable = 1 in
   8499 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
   8500           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
   8501           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
   8502           [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
   8503                             (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
   8504           VEX_4V, VEX_L;
   8505 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
   8506           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
   8507           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
   8508           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
   8509                              (i8 imm:$src3)))]>,
   8510           Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
   8511 
        // Map the remaining integer element types (and their bitconverted
        // load forms) onto the v4i64-typed instruction.
   8512 let Predicates = [HasAVX2] in {
   8513 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8514           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8515 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8516           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8517 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
   8518           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
   8519 
   8520 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
   8521                   (i8 imm:$imm))),
   8522           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8523 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
   8524                    (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
   8525           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8526 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
   8527                   (i8 imm:$imm))),
   8528           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
   8529 }
   8530 
   8531 
   8532 //===----------------------------------------------------------------------===//
   8533 // VINSERTI128 - Insert packed integer values
   8534 //
        // VINSERTI128: insert an XMM value into one 128-bit lane of a YMM.
        // Both forms are pattern-less (hasSideEffects = 0 marks them safe
        // for the scheduler); selection happens via the vinsert128_insert
        // patterns that follow.
   8535 let hasSideEffects = 0 in {
   8536 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
   8537           (ins VR256:$src1, VR128:$src2, u8imm:$src3),
   8538           "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
   8539           []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
   8540 let mayLoad = 1 in
   8541 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
   8542           (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
   8543           "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
   8544           []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
   8545 }
   8546 
// Selection patterns for VINSERTI128.  Restricted to NoVLX because with
// AVX-512VL the EVEX-encoded insert instructions are preferred.  One pattern
// per 256-bit integer element type; INSERT_get_vinsert128_imm converts the
// matched element insertion index into the instruction's lane immediate.
let Predicates = [HasAVX2, NoVLX] in {
// Register-source inserts.
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Memory-source inserts.  The narrower element types go through a
// bitconvert of a canonical v2i64 load (bc_v4i32/bc_v16i8/bc_v8i16).
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
   8585 
   8586 //===----------------------------------------------------------------------===//
   8587 // VEXTRACTI128 - Extract packed integer values
   8588 //
   8589 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
   8590           (ins VR256:$src1, u8imm:$src2),
   8591           "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
   8592           Sched<[WriteShuffle256]>, VEX, VEX_L;
   8593 let hasSideEffects = 0, mayStore = 1 in
   8594 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
   8595           (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
   8596           "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
   8597           Sched<[WriteStore]>, VEX, VEX_L;
   8598 
// Selection patterns for VEXTRACTI128 (NoVLX: EVEX extracts take over under
// AVX-512VL).  The four register patterns differ only in the vector element
// type annotated on the result/source; EXTRACT_get_vextract128_imm converts
// the matched element extraction index into the instruction's lane immediate.
let Predicates = [HasAVX2, NoVLX] in {
// Extract to register.
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Extract folded into a store: match (store (extract ...)) directly to the
// memory-destination form so no intermediate XMM register is needed.
def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
   8634 
   8635 //===----------------------------------------------------------------------===//
   8636 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
   8637 //
   8638 multiclass avx2_pmovmask<string OpcodeStr,
   8639                          Intrinsic IntLd128, Intrinsic IntLd256,
   8640                          Intrinsic IntSt128, Intrinsic IntSt256> {
   8641   def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
   8642              (ins VR128:$src1, i128mem:$src2),
   8643              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8644              [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
   8645   def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
   8646              (ins VR256:$src1, i256mem:$src2),
   8647              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8648              [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
   8649              VEX_4V, VEX_L;
   8650   def mr  : AVX28I<0x8e, MRMDestMem, (outs),
   8651              (ins i128mem:$dst, VR128:$src1, VR128:$src2),
   8652              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8653              [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
   8654   def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
   8655              (ins i256mem:$dst, VR256:$src1, VR256:$src2),
   8656              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8657              [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
   8658 }
   8659 
// Doubleword-element masked load/store.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
// Quadword-element masked load/store; VEX_W selects the 64-bit element size.
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
   8670 
// Lowers generic masked_load / X86mstore DAG nodes onto an existing maskmov
// instruction pair (named by InstrStr, e.g. "VMASKMOVPS").  The three load
// patterns differ in the pass-through value for masked-off lanes:
//  - undef and all-zeros pass-throughs map directly to the plain masked load
//    (the hardware instruction zeroes masked-off elements);
//  - an arbitrary $src0 pass-through is lowered as a masked load followed by
//    a variable blend (BlendStr) selecting between the load and $src0.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
    // masked store
    def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
    // masked load, undef pass-through
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    // masked load, zero pass-through (ZeroVT is the canonical all-zeros type)
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                              (VT (bitconvert (ZeroVT immAllZerosV))))),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    // masked load, arbitrary pass-through: load then blend with $src0
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
             (!cast<Instruction>(BlendStr#"rr")
                 RC:$src0,
                 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
                 RC:$mask)>;
}
// AVX1: floating-point masked moves lower to VMASKMOVPS/PD + VBLENDVPS/PD.
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
// AVX1-only (no AVX2): there are no integer maskmov instructions, so integer
// masked moves reuse the bitwise-equivalent ps/pd forms.
let Predicates = [HasAVX1Only] in {
  // load/store i32/i64 not supported use ps/pd version
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
// AVX2: integer masked moves use the native VPMASKMOVD/Q instructions
// (blends still use the FP VBLENDV forms, which are bitwise-compatible).
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
   8707 //===----------------------------------------------------------------------===//
   8708 // Variable Bit Shifts
   8709 //
   8710 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
   8711                           ValueType vt128, ValueType vt256> {
   8712   def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
   8713              (ins VR128:$src1, VR128:$src2),
   8714              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8715              [(set VR128:$dst,
   8716                (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
   8717              VEX_4V, Sched<[WriteVarVecShift]>;
   8718   def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
   8719              (ins VR128:$src1, i128mem:$src2),
   8720              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8721              [(set VR128:$dst,
   8722                (vt128 (OpNode VR128:$src1,
   8723                        (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
   8724              VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
   8725   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
   8726              (ins VR256:$src1, VR256:$src2),
   8727              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8728              [(set VR256:$dst,
   8729                (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
   8730              VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
   8731   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
   8732              (ins VR256:$src1, i256mem:$src2),
   8733              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   8734              [(set VR256:$dst,
   8735                (vt256 (OpNode VR256:$src1,
   8736                        (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
   8737              VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
   8738 }
   8739 
// AVX2 variable shifts (NoVLX: AVX-512VL provides EVEX replacements).
// VEX_W selects the 64-bit element variants.  Note there is no vpsravq in
// AVX2, so sra is only instantiated for 32-bit elements.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
  // CodeGen-only duplicate of vpsravd selected from the X86vsrav node
  // (used for the intrinsic form); shares the encoding with VPSRAVD.
  let isCodeGenOnly = 1 in
    defm VPSRAVD_Int : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
   8749 //===----------------------------------------------------------------------===//
   8750 // VGATHER - GATHER Operations
   8751 multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
   8752                        X86MemOperand memop128, X86MemOperand memop256> {
   8753   def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
   8754             (ins VR128:$src1, memop128:$src2, VR128:$mask),
   8755             !strconcat(OpcodeStr,
   8756               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
   8757             []>, VEX_4VOp3;
   8758   def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
   8759             (ins RC256:$src1, memop256:$src2, RC256:$mask),
   8760             !strconcat(OpcodeStr,
   8761               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
   8762             []>, VEX_4VOp3, VEX_L;
   8763 }
   8764 
// All gather instructions: the destination and mask-writeback results are
// early-clobber (they must not alias the index/memory operands), and are
// tied to the $src1 / $mask inputs respectively, matching the ISA's
// read-modify-write behavior for both registers.
let mayLoad = 1, hasSideEffects = 0, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
  // qd form: four 64-bit indices gather four 32-bit elements into an XMM.
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;

  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
  }
}
   8783 
   8784 //===----------------------------------------------------------------------===//
   8785 // Extra selection patterns for FR128, f128, f128mem
   8786 
   8787 // movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
   8788 def : Pat<(store (f128 FR128:$src), addr:$dst),
   8789           (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
   8790 
   8791 def : Pat<(loadf128 addr:$src),
   8792           (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
   8793 
   8794 // andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
   8795 def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
   8796           (COPY_TO_REGCLASS
   8797            (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
   8798            FR128)>;
   8799 
   8800 def : Pat<(X86fand FR128:$src1, FR128:$src2),
   8801           (COPY_TO_REGCLASS
   8802            (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
   8803                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
   8804 
   8805 def : Pat<(and FR128:$src1, FR128:$src2),
   8806           (COPY_TO_REGCLASS
   8807            (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
   8808                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
   8809 
   8810 def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
   8811           (COPY_TO_REGCLASS
   8812            (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
   8813            FR128)>;
   8814 
   8815 def : Pat<(X86for FR128:$src1, FR128:$src2),
   8816           (COPY_TO_REGCLASS
   8817            (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
   8818                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
   8819 
   8820 def : Pat<(or FR128:$src1, FR128:$src2),
   8821           (COPY_TO_REGCLASS
   8822            (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
   8823                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
   8824 
   8825 def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
   8826           (COPY_TO_REGCLASS
   8827            (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
   8828            FR128)>;
   8829 
   8830 def : Pat<(X86fxor FR128:$src1, FR128:$src2),
   8831           (COPY_TO_REGCLASS
   8832            (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
   8833                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
   8834 
   8835 def : Pat<(xor FR128:$src1, FR128:$src2),
   8836           (COPY_TO_REGCLASS
   8837            (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
   8838                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
   8839