//===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringX86Base class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//

#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
#define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstVarIter.h"
#include "IceInstX86Base.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceUtils.h"
#include "IceVariableSplitting.h"

#include "llvm/Support/MathExtras.h"

#include <stack>

namespace Ice {
namespace X86 {
template <typename T> struct PoolTypeConverter {};

template <> struct PoolTypeConverter<float> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantFloat;
  static const Type Ty = IceType_f32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

template <> struct PoolTypeConverter<double> {
  using PrimitiveIntType = uint64_t;
  using IceType = ConstantDouble;
  static const Type Ty = IceType_f64;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Converter for i32 constant pooling.
template <> struct PoolTypeConverter<uint32_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Converter for i16 constant pooling.
template <> struct PoolTypeConverter<uint16_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i16;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Converter for i8 constant pooling.
template <> struct PoolTypeConverter<uint8_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i8;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};
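
// The static TypeName/AsmTag/PrintfString members of each specialization are
// defined out-of-line in the target-specific .cpp files. As a rough,
// illustrative sketch only (check the target sources for the real values),
// the float specialization's definitions look like:
//   const char *PoolTypeConverter<float>::TypeName = "float";
//   const char *PoolTypeConverter<float>::AsmTag = ".long";
//   const char *PoolTypeConverter<float>::PrintfString = "0x%x";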
} // end of namespace X86

namespace X86NAMESPACE {

using Utils::BoolFlagSaver;

template <typename Traits> class BoolFoldingEntry {
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  BoolFoldingEntry() = default;
  explicit BoolFoldingEntry(Inst *I);
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
  /// Instr is the instruction producing the i1-type variable of interest.
  Inst *Instr = nullptr;
  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  /// IsLiveOut is initialized conservatively to true, and is set to false when
  /// we encounter an instruction that ends Var's live range. We disable the
  /// folding optimization when Var is live beyond this basic block. Note that
  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
  /// always be true and the folding optimization will never be performed.
  bool IsLiveOut = true;
  /// NumUses counts the number of times Var is used as a source operand in the
  /// basic block. If IsComplex is true and there is more than one use of Var,
  /// then the folding optimization is disabled for Var.
  uint32_t NumUses = 0;
};

template <typename Traits> class BoolFolding {
public:
  enum BoolFoldingProducerKind {
    PK_None,
    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
    PK_Icmp32,
    PK_Icmp64,
    PK_Fcmp,
    PK_Trunc,
    PK_Arith // A flag-setting arithmetic instruction.
  };

  /// Currently the actual enum values are not used (other than CK_None), but we
  /// go ahead and produce them anyway for symmetry with the
  /// BoolFoldingProducerKind.
  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };

private:
  BoolFolding(const BoolFolding &) = delete;
  BoolFolding &operator=(const BoolFolding &) = delete;

public:
  BoolFolding() = default;
  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
  static bool hasComplexLowering(const Inst *Instr);
  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
                             BoolFoldingConsumerKind ConsumerKind);
  void init(CfgNode *Node);
  const Inst *getProducerFor(const Operand *Opnd) const;
  void dump(const Cfg *Func) const;

private:
  /// Returns true if Producers contains a valid entry for the given VarNum.
  bool containsValid(SizeT VarNum) const {
    auto Element = Producers.find(VarNum);
    return Element != Producers.end() && Element->second.Instr != nullptr;
  }
  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
  void invalidateProducersOnStore(const Inst *Instr);
  /// Producers maps Variable::Number to a BoolFoldingEntry.
  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
};
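
// An illustrative example (not taken from this file) of the rewrite that
// BoolFolding enables. For a producer/consumer pair such as:
//   b = icmp eq i32 x, y     ; i1-typed producer
//   br b, label t, label f   ; consumer
// the icmp's flag-setting lowering can feed the branch directly:
//   cmp x, y
//   je t
// instead of materializing b in a register via setcc and re-testing it at
// the branch.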

template <typename Traits>
BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingProducerKind
BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
      return PK_Icmp32;
    return PK_Icmp64;
  }
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return PK_None;
      case InstArithmetic::And:
      case InstArithmetic::Or:
        return PK_Arith;
      }
    }
  }
  return PK_None; // TODO(stichnot): remove this early return, which currently
                  // disables the Trunc handling below.

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingConsumerKind
BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this early return, which currently
                  // disables the Sext/Zext handling below.

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}

/// Returns true if the producing instruction has a "complex" lowering
/// sequence. This generally means that its lowering sequence requires more
/// than one conditional branch, namely 64-bit integer compares and some
/// floating-point compares. When this is true and there is more than one
/// consumer, we disable the folding optimization, because folding would
/// duplicate the producer's multi-branch sequence at each consumer.
template <typename Traits>
bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return !Traits::Is64Bit;
  case PK_Fcmp:
    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
           Traits::Cond::Br_None;
  }
}
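
// For example, on x86 an fcmp whose condition needs two flag tests (its
// TableFcmp C2 entry is a real branch condition rather than Br_None) lowers
// to two conditional branches, and a 64-bit icmp on a 32-bit target must
// compare the two 32-bit halves separately; both count as "complex" here. A
// plain i32 icmp lowers to a single cmp/branch and does not.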

template <typename Traits>
bool BoolFolding<Traits>::isValidFolding(
    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
  switch (ProducerKind) {
  default:
    return false;
  case PK_Icmp32:
  case PK_Icmp64:
  case PK_Fcmp:
    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
  case PK_Arith:
    return ConsumerKind == CK_Br;
  }
}
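
// Rationale sketch: icmp/fcmp producers can be consumed by either a br or a
// select, since both lower to code that tests the comparison directly. A
// PK_Arith producer sets flags only as a side effect of computing its result,
// and the lowering here only takes advantage of those flags when the consumer
// is a br.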

template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    if (Instr.isDeleted())
      continue;
    invalidateProducersOnStore(&Instr);
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (Var) { // only consider instructions with an actual dest var
      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
        }
      }
    }
    // Check each src variable against the map.
    FOREACH_VAR_IN_INST(Var, Instr) {
      SizeT VarNum = Var->getIndex();
      if (!containsValid(VarNum))
        continue;
      // All valid consumers use Var as the first source operand.
      if (IndexOfVarOperandInInst(Var) != 0) {
        setInvalid(VarNum);
        continue;
      }
      // Consumer instructions must be white-listed.
      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
          getConsumerKind(&Instr);
      if (ConsumerKind == CK_None) {
        setInvalid(VarNum);
        continue;
      }
      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
          getProducerKind(Producers[VarNum].Instr);
      if (!isValidFolding(ProducerKind, ConsumerKind)) {
        setInvalid(VarNum);
        continue;
      }
      // Avoid creating multiple copies of complex producer instructions.
      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
        setInvalid(VarNum);
        continue;
      }
      ++Producers[VarNum].NumUses;
      if (Instr.isLastUse(Var)) {
        Producers[VarNum].IsLiveOut = false;
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that other
    // peephole style optimizations during or before lowering have access to
    // this instruction in undeleted form. See for example
    // tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

template <typename Traits>
const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

template <typename Traits>
void BoolFolding<Traits>::dump(const Cfg *Func) const {
  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n  ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction.  If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction.  The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.
template <typename Traits>
void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
  if (!Instr->isMemoryWrite())
    return;
  for (auto &ProducerPair : Producers) {
    if (!ProducerPair.second.IsLiveOut)
      continue;
    Inst *PInst = ProducerPair.second.Instr;
    if (PInst == nullptr)
      continue;
    bool HasMemOperand = false;
    const SizeT SrcSize = PInst->getSrcSize();
    for (SizeT I = 0; I < SrcSize; ++I) {
      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
        HasMemOperand = true;
        break;
      }
    }
    if (!HasMemOperand)
      continue;
    setInvalid(ProducerPair.first);
  }
}
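
// An illustrative hazard (not taken from this file) that this invalidation
// prevents: folding a producer like "b = icmp eq i32 [mem], 0" past an
// intervening "store ..., [mem]" into a later branch would make the folded
// compare read the updated memory value rather than the original one.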

template <typename TraitsType>
void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

template <typename TraitsType>
TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func)
    : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) {
  static_assert(
      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
          (TargetInstructionSet::X86InstructionSet_End -
           TargetInstructionSet::X86InstructionSet_Begin),
      "Traits::InstructionSet range different from TargetInstructionSet");
  if (getFlags().getTargetInstructionSet() !=
      TargetInstructionSet::BaseInstructionSet) {
    InstructionSet = static_cast<InstructionSetEnum>(
        (getFlags().getTargetInstructionSet() -
         TargetInstructionSet::X86InstructionSet_Begin) +
        Traits::InstructionSet::Begin);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                          Traits::getRegName, getRegClassName);
  PcRelFixup = Traits::FK_PcRel;
  AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
    return !Utils::isPositiveZero(ConstFloat->getValue());
  }
  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
    return !Utils::isPositiveZero(ConstDouble->getValue());
  }
  if (getFlags().getRandomizeAndPoolImmediatesOption() != RPI_Pool) {
    return false;
  }
  return C->shouldBeRandomizedOrPooled();
}

template <typename TraitsType>
::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
  if (!Traits::Is64Bit ||
      ::Ice::getFlags().getApplicationBinaryInterface() == ::Ice::ABI_PNaCl) {
    return ::Ice::IceType_i32;
  }
  return ::Ice::IceType_i64;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc, not for the
  // expensive high-level optimizations that could be focused on potentially
  // hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
    Func->dump("After Local CSE");
    Func->floatConstantCSE();
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering.
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  // Disable constant blinding or pooling for load optimization.
  {
    BoolFlagSaver B(RandomizationPoolingPaused, true);
    doLoadOpt();
  }
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After x86 codegen");
  splitBlockLocalVariables(Func);

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x86 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Branch optimization.  This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment.
  if (NeedSandboxing) {
    Func->markNodesForSandboxing();
  }
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = false;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After initial x86 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment.
  if (NeedSandboxing)
    Func->markNodesForSandboxing();
}

inline bool canRMW(const InstArithmetic *Arith) {
  Type Ty = Arith->getDest()->getType();
  // X86 vector instructions write to a register and have no RMW option.
  if (isVectorType(Ty))
    return false;
  bool isI64 = Ty == IceType_i64;

  switch (Arith->getOp()) {
  // Not handled for lack of simple lowering:
  //   shift on i64
  //   mul, udiv, urem, sdiv, srem, frem
  // Not handled for lack of RMW instructions:
  //   fadd, fsub, fmul, fdiv (also vector types)
  default:
    return false;
  case InstArithmetic::Add:
  case InstArithmetic::Sub:
  case InstArithmetic::And:
  case InstArithmetic::Or:
  case InstArithmetic::Xor:
    return true;
  case InstArithmetic::Shl:
  case InstArithmetic::Lshr:
  case InstArithmetic::Ashr:
    return false; // TODO(stichnot): implement
    return !isI64;
  }
}
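
// For the ops accepted above, x86 has memory-destination instruction forms,
// so (illustratively) the sequence
//   a = load [x]
//   b = add a, 1
//   store b, [x]
// can eventually be emitted as a single memory-destination add such as
// "addl $1, (%eax)", once the FakeRMW pseudo-instruction created in
// findRMW() below is lowered.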

template <typename TraitsType>
bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
  if (A == B)
    return true;
  if (auto *MemA =
          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
              A)) {
    if (auto *MemB =
            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
                B)) {
      return MemA->getBase() == MemB->getBase() &&
             MemA->getOffset() == MemB->getOffset() &&
             MemA->getIndex() == MemB->getIndex() &&
             MemA->getShift() == MemB->getShift() &&
             MemA->getSegmentRegister() == MemB->getSegmentRegister();
    }
  }
  return false;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
  TimerMarker _(TimerStack::TT_findRMW, Func);
  Func->dump("Before RMW");
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->lockStr();
  for (CfgNode *Node : Func->getNodes()) {
    // Walk through the instructions, considering each sequence of 3
    // instructions, and look for the particular RMW pattern. Note that this
    // search can be "broken" (false negatives) if there are intervening
    // deleted instructions, or intervening instructions that could be safely
    // moved out of the way to reveal an RMW pattern.
    auto E = Node->getInsts().end();
    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
      // Make I3 skip over deleted instructions.
      while (I3 != E && I3->isDeleted())
        ++I3;
      if (I1 == E || I2 == E || I3 == E)
        continue;
      assert(!I1->isDeleted());
      assert(!I2->isDeleted());
      assert(!I3->isDeleted());
      auto *Load = llvm::dyn_cast<InstLoad>(I1);
      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
      auto *Store = llvm::dyn_cast<InstStore>(I3);
      if (!Load || !Arith || !Store)
        continue;
      // Look for:
      //   a = Load addr
      //   b = <op> a, other
      //   Store b, addr
      // Change to:
      //   a = Load addr
      //   b = <op> a, other
      //   x = FakeDef
      //   RMW <op>, addr, other, x
      //   b = Store b, addr, x
      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
      // on the updated Store instruction, to avoid liveness problems later.
      //
      // With this transformation, the Store instruction acquires a Dest
      // variable and is now subject to dead code elimination if there are no
      // more uses of "b".  Variable "x" is a beacon for determining whether the
      // Store instruction gets dead-code eliminated.  If the Store instruction
      // is eliminated, then it must be the case that the RMW instruction ends
      // x's live range, and therefore the RMW instruction will be retained and
      // later lowered.  On the other hand, if the RMW instruction does not end
      // x's live range, then the Store instruction must still be present, and
      // therefore the RMW instruction is ignored during lowering because it is
      // redundant with the Store instruction.
      //
      // Note that if "a" has further uses, the RMW transformation may still
      // trigger, resulting in two loads and one store, which is worse than the
      // original one load and one store.  However, this is probably rare, and
      // caching probably keeps it just as fast.
      if (!isSameMemAddressOperand<TraitsType>(Load->getSourceAddress(),
                                               Store->getAddr()))
        continue;
      Operand *ArithSrcFromLoad = Arith->getSrc(0);
      Operand *ArithSrcOther = Arith->getSrc(1);
      if (ArithSrcFromLoad != Load->getDest()) {
        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
          continue;
        std::swap(ArithSrcFromLoad, ArithSrcOther);
      }
      if (Arith->getDest() != Store->getData())
        continue;
      if (!canRMW(Arith))
        continue;
      if (Func->isVerbose(IceV_RMW)) {
        Ostream &Str = Func->getContext()->getStrDump();
        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
        Load->dump(Func);
        Str << "\n  ";
        Arith->dump(Func);
        Str << "\n  ";
        Store->dump(Func);
        Str << "\n";
      }
      Variable *Beacon = Func->makeVariable(IceType_i32);
      Beacon->setMustNotHaveReg();
      Store->setRmwBeacon(Beacon);
      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
      Node->getInsts().insert(I3, BeaconDef);
      auto *RMW = InstX86FakeRMW::create(Func, ArithSrcOther, Store->getAddr(),
                                         Beacon, Arith->getOp());
      Node->getInsts().insert(I3, RMW);
    }
  }
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->unlockStr();
}

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}

/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}
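
// Illustrative fold enabled by this helper (assuming the consumer ends the
// load dest's live range):
//   a = load addr
//   c = add b, a
// becomes
//   c = add b, [addr]
// with the load deleted; doLoadOpt() below performs this rewrite.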

template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = iteratorToInst(Context.getCur());
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad always qualifies.
        LoadDest = Load->getDest();
        constexpr bool DoLegalize = false;
        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
                                    LoadDest->getType(), DoLegalize);
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
        if (ID == Intrinsics::AtomicLoad &&
            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
                                                         Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark a named physical register as an "argument" so that it is
    // considered live upon function entry.  Otherwise it's possible to get
    // liveness validation errors for saving callee-save registers.
    Func->addImplicitArg(Reg);
    // Don't bother tracking the live range of a named physical register.
    Reg->setIgnoreLiveness();
  }
  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
  return Reg;
}

template <typename TraitsType>
const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
                                                  Type Ty) const {
  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
}

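// emitVariable() prints a variable as an assembler operand: a register name
// such as "%eax" when the variable has a register, and otherwise a stack
// slot reference such as "-4(%ebp)" (names and offsets here are
// illustrative).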
template <typename TraitsType>
void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing;
    const Type VarType = (Var->isRematerializable() && Is64BitSandboxing)
                             ? IceType_i64
                             : Var->getType();
    Str << "%" << getRegName(Var->getRegNum(), VarType);
    return;
  }
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  const int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum.hasNoValue())
    BaseRegNum = getFrameOrStackReg();

  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
  if (getFlags().getDecorateAsm()) {
    Str << Var->getSymbolicStackOffset();
  } else if (Offset != 0) {
    Str << Offset;
  }
  const Type FrameSPTy = Traits::WordType;
  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86Address
TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
  if (Var->hasReg())
    llvm::report_fatal_error("Stack Variable has a register assigned");
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (Var->getBaseRegNum().hasNoValue())
    BaseRegNum = getFrameOrStackReg();
  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
                    AssemblerFixup::NoFixup);
}
    994 
    995 template <typename TraitsType>
    996 void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
    997   // Stack frame layout:
    998   //
    999   // +------------------------+
   1000   // | 1. return address      |
   1001   // +------------------------+
   1002   // | 2. preserved registers |
   1003   // +------------------------+ <--- BasePointer (if used)
   1004   // | 3. padding             |
   1005   // +------------------------+
   1006   // | 4. global spill area   |
   1007   // +------------------------+
   1008   // | 5. padding             |
   1009   // +------------------------+
   1010   // | 6. local spill area    |
   1011   // +------------------------+
   1012   // | 7. padding             |
   1013   // +------------------------+
   1014   // | 8. allocas             |
   1015   // +------------------------+
   1016   // | 9. padding             |
   1017   // +------------------------+
   1018   // | 10. out args           |
   1019   // +------------------------+ <--- StackPointer
   1020   //
   1021   // The following variables record the size in bytes of the given areas:
   1022   //  * X86_RET_IP_SIZE_BYTES:   area 1
   1023   //  * PreservedRegsSizeBytes:  area 2
   1024   //  * SpillAreaPaddingBytes:   area 3
   1025   //  * GlobalsSize:             area 4
   1026   //  * LocalsSlotsPaddingBytes: area 5
   1027   //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
   1028   //  * LocalsSpillAreaSize:     area 6
   1029   //  * FixedAllocaSizeBytes:    areas 7 - 8
   1030   //  * SpillAreaSizeBytes:      areas 3 - 10
   1031   //  * maxOutArgsSizeBytes():   areas 9 - 10
   1032 
   1033   // Determine stack frame offsets for each Variable without a register
   1034   // assignment. This can be done as one variable per stack slot. Or, do
   1035   // coalescing by running the register allocator again with an infinite set of
   1036   // registers (as a side effect, this gives variables a second chance at
   1037   // physical register assignment).
   1038   //
   1039   // A middle ground approach is to leverage sparsity and allocate one block of
   1040   // space on the frame for globals (variables with multi-block lifetime), and
   1041   // one block to share for locals (single-block lifetime).
   1042 
   1043   Context.init(Node);
   1044   Context.setInsertPoint(Context.getCur());
   1045 
   1046   SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
   1047   RegsUsed = SmallBitVector(CalleeSaves.size());
   1048   VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
   1049   size_t GlobalsSize = 0;
   1050   // If there is a separate locals area, this represents that area. Otherwise
   1051   // it counts any variable not counted by GlobalsSize.
   1052   SpillAreaSizeBytes = 0;
   1053   // If there is a separate locals area, this specifies the alignment for it.
   1054   uint32_t LocalsSlotsAlignmentBytes = 0;
   1055   // The entire spill locations area gets aligned to largest natural alignment
   1056   // of the variables that have a spill slot.
   1057   uint32_t SpillAreaAlignmentBytes = 0;
   1058   // A spill slot linked to a variable with a stack slot should reuse that
   1059   // stack slot.
   1060   std::function<bool(Variable *)> TargetVarHook =
   1061       [&VariablesLinkedToSpillSlots](Variable *Var) {
   1062         // TODO(stichnot): Refactor this into the base class.
   1063         Variable *Root = Var->getLinkedToStackRoot();
   1064         if (Root != nullptr) {
   1065           assert(!Root->hasReg());
   1066           if (!Root->hasReg()) {
   1067             VariablesLinkedToSpillSlots.push_back(Var);
   1068             return true;
   1069           }
   1070         }
   1071         return false;
   1072       };
   1073 
   1074   // Compute the list of spilled variables and bounds for GlobalsSize, etc.
   1075   getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
   1076                         &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
   1077                         &LocalsSlotsAlignmentBytes, TargetVarHook);
   1078   uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
   1079   SpillAreaSizeBytes += GlobalsSize;
   1080 
   1081   // Add push instructions for preserved registers.
   1082   uint32_t NumCallee = 0;
   1083   size_t PreservedRegsSizeBytes = 0;
   1084   SmallBitVector Pushed(CalleeSaves.size());
   1085   for (RegNumT i : RegNumBVIter(CalleeSaves)) {
   1086     const auto Canonical = Traits::getBaseReg(i);
   1087     assert(Canonical == Traits::getBaseReg(Canonical));
   1088     if (RegsUsed[i]) {
   1089       Pushed[Canonical] = true;
   1090     }
   1091   }
   1092   for (RegNumT RegNum : RegNumBVIter(Pushed)) {
   1093     assert(RegNum == Traits::getBaseReg(RegNum));
   1094     ++NumCallee;
   1095     PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
   1096     _push_reg(getPhysicalRegister(RegNum, Traits::WordType));
   1097   }
   1098   Ctx->statsUpdateRegistersSaved(NumCallee);
   1099 
   1100   // Generate "push frameptr; mov frameptr, stackptr"
   1101   if (IsEbpBasedFrame) {
   1102     assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
   1103                .count() == 0);
   1104     PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
   1105     _link_bp();
   1106   }
   1107 
   1108   // Align the variables area. SpillAreaPaddingBytes is the size of the region
   1109   // after the preserved registers and before the spill areas.
   1110   // LocalsSlotsPaddingBytes is the amount of padding between the globals and
   1111   // locals area if they are separate.
   1112   assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
   1113   uint32_t SpillAreaPaddingBytes = 0;
   1114   uint32_t LocalsSlotsPaddingBytes = 0;
   1115   alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
   1116                        SpillAreaAlignmentBytes, GlobalsSize,
   1117                        LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
   1118                        &LocalsSlotsPaddingBytes);
   1119   SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
   1120   uint32_t GlobalsAndSubsequentPaddingSize =
   1121       GlobalsSize + LocalsSlotsPaddingBytes;
   1122 
   1123   // Functions returning scalar floating point types may need to convert values
   1124   // from an in-register xmm value to the top of the x87 floating point stack.
   1125   // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
   1126   // space on the stack for this.
   1127   const Type ReturnType = Func->getReturnType();
   1128   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
   1129     if (isScalarFloatingType(ReturnType)) {
   1130       // Avoid misaligned double-precision load/store.
   1131       RequiredStackAlignment = std::max<size_t>(
   1132           RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
   1133       SpillAreaSizeBytes =
   1134           std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
   1135     }
   1136   }
   1137 
   1138   RequiredStackAlignment =
   1139       std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
   1140 
   1141   if (PrologEmitsFixedAllocas) {
   1142     RequiredStackAlignment =
   1143         std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
   1144   }
   1145 
   1146   // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
   1147   // fixed allocations in the prolog.
   1148   if (PrologEmitsFixedAllocas)
   1149     SpillAreaSizeBytes += FixedAllocaSizeBytes;
   1150 
   1151   // Entering the function has made the stack pointer unaligned. Re-align it by
   1152   // adjusting the stack size.
   1153   uint32_t StackOffset = Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
   1154   uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
   1155                                              RequiredStackAlignment);
   1156   StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
   1157                                     RequiredStackAlignment);
   1158   SpillAreaSizeBytes = StackSize - StackOffset;
   1159 
   1160   if (SpillAreaSizeBytes) {
   1161     // Generate "sub stackptr, SpillAreaSizeBytes"
   1162     _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
   1163   }
   1164 
   1165   // If the required alignment is greater than the stack pointer's guaranteed
   1166   // alignment, align the stack pointer accordingly.
   1167   if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
   1168     assert(IsEbpBasedFrame);
   1169     _and(getPhysicalRegister(getStackReg(), Traits::WordType),
   1170          Ctx->getConstantInt32(-RequiredStackAlignment));
   1171   }
   1172 
   1173   // Account for known-frame-offset alloca instructions that were not already
   1174   // combined into the prolog.
   1175   if (!PrologEmitsFixedAllocas)
   1176     SpillAreaSizeBytes += FixedAllocaSizeBytes;
   1177 
   1178   Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
   1179 
   1180   // Fill in stack offsets for stack args, and copy args into registers for
   1181   // those that were register-allocated. Args are pushed right to left, so
   1182   // Arg[0] is closest to the stack/frame pointer.
   1183   RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
   1184   Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
   1185   size_t BasicFrameOffset =
   1186       PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
   1187   if (!IsEbpBasedFrame)
   1188     BasicFrameOffset += SpillAreaSizeBytes;

  emitGetIP(Node);

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (Variable *Arg : Args) {
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
          Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (Traits::getRegisterForGprArgNum(Traits::WordType, NumGPRArgs)
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed.  In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame && !needsStackPointerAlignment());
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToStackRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());
  }
  this->HasComputedFrame = true;

  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}

/// Helper function for addProlog().
///
/// This assumes Arg is an argument passed on the stack. This sets the frame
/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
/// I64 arg that has been split into Lo and Hi components, it calls itself
/// recursively on the components, taking care to handle Lo first because of the
/// little-endian architecture. Lastly, this function generates an instruction
/// to copy Arg into its assigned register if applicable.
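///
/// For example (illustrative only), on a 32-bit target an i64 in-arg whose low
/// word sits at [frame+8] is lowered as two i32 slots: Lo at [frame+8] and Hi
/// at [frame+12].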
template <typename TraitsType>
void TargetX86Base<TraitsType>::finishArgumentLowering(
    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
  if (!Traits::Is64Bit) {
    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
      Variable *Lo = Arg64On32->getLo();
      Variable *Hi = Arg64On32->getHi();
      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      return;
    }
  }
  Type Ty = Arg->getType();
  if (isVectorType(Ty)) {
    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64 || Traits::Is64Bit);
    auto *Mem = X86OperandMem::create(
        Func, Ty, FramePtr,
        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit X86OperandMem
    // operand instead of a Variable, so its fill-from-stack operation has to
    // be tracked separately for statistics.
    Ctx->statsUpdateFills();
  }
}

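/// Inserts the function epilog just before the ret instruction in \p Node (if
/// there is one): restores the frame or stack pointer, pops the preserved
/// registers, and, when sandboxing is enabled, replaces the plain return with
/// a sandboxed return sequence.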
template <typename TraitsType>
void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
  InstList &Insts = Node->getInsts();
  InstList::reverse_iterator RI, E;
  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
    if (llvm::isa<typename Traits::Insts::Ret>(*RI))
      break;
  }
  if (RI == E)
    return;

  // Convert the reverse_iterator position into its corresponding (forward)
  // iterator position.
  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
  --InsertPoint;
  Context.init(Node);
  Context.setInsertPoint(InsertPoint);

  if (IsEbpBasedFrame) {
    _unlink_bp();
  } else {
    // add stackptr, SpillAreaSizeBytes
    if (SpillAreaSizeBytes != 0) {
      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
    }
  }

  // Add pop instructions for preserved registers.
  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  SmallBitVector Popped(CalleeSaves.size());
  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
    const auto RegNum = RegNumT::fromInt(i);
    if (RegNum == getFrameReg() && IsEbpBasedFrame)
      continue;
    const RegNumT Canonical = Traits::getBaseReg(RegNum);
    if (CalleeSaves[i] && RegsUsed[i]) {
      Popped[Canonical] = true;
    }
  }
  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
    if (!Popped[i])
      continue;
    const auto RegNum = RegNumT::fromInt(i);
    assert(RegNum == Traits::getBaseReg(RegNum));
    _pop(getPhysicalRegister(RegNum, Traits::WordType));
  }

  if (!NeedSandboxing) {
    return;
  }
  emitSandboxedReturn();
  if (RI->getSrcSize()) {
    auto *RetValue = llvm::cast<Variable>(RI->getSrc(0));
    Context.insert<InstFakeUse>(RetValue);
  }
  RI->setDeleted();
}

template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
  return Traits::WordType;
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, Operand>::type *
TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
    return Var64On32->getLo();
  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
    // Check if we need to blind/pool the constant.
    return legalize(ConstInt);
  }
  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
    auto *MemOperand = X86OperandMem::create(
        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // If the offset should be randomized or pooled, legalize() blinds/pools
    // it and rebuilds the mem operand around the resulting constant;
    // otherwise it returns the mem operand unchanged.
    return legalize(MemOperand);
  }
  llvm_unreachable("Unsupported operand type");
  return nullptr;
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, Operand>::type *
TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
    return Var64On32->getHi();
  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
    // Check if we need to blind/pool the constant.
    return legalize(ConstInt);
  }
  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
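    // The high half lives 4 bytes above the low half; e.g. if the i64 is at
    // [ebp+12], its high word is at [ebp+16].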
    Constant *Offset = Mem->getOffset();
    if (Offset == nullptr) {
      Offset = Ctx->getConstantInt32(4);
    } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
    } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
      Offset =
          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
    }
    auto *MemOperand = X86OperandMem::create(
        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // Test if the Offset is an eligible i32 constant for randomization and
    // pooling; blind/pool it if so, otherwise return it as an ordinary mem
    // operand.
    return legalize(MemOperand);
  }
  llvm_unreachable("Unsupported operand type");
  return nullptr;
}

template <typename TraitsType>
SmallBitVector
TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
                                          RegSetMask Exclude) const {
  return Traits::getRegisterSet(getFlags(), Include, Exclude);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
  // Conservatively require the stack to be aligned. Some stack adjustment
  // operations implemented below assume that the stack is aligned before the
  // alloca. All the alloca code ensures that the stack alignment is preserved
  // after the alloca. The stack alignment restriction can be relaxed in some
  // cases.
  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
                                            Traits::X86_STACK_ALIGNMENT_BYTES);

  // For default align=0, set it to the real value 1, to avoid any
  // bit-manipulation problems below.
  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());

  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(AlignmentParam));
  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));

  const uint32_t Alignment =
      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
  const bool OptM1 = Func->getOptLevel() == Opt_m1;
  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
  const bool UseFramePointer =
      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;

  if (UseFramePointer)
    setHasFramePointer();

  Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
  if (OverAligned) {
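    // e.g. an alloca over-aligned to 32 bytes emits "and esp, -32"
    // (0xffffffe0) here to round esp down to the requested alignment.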
    _and(esp, Ctx->getConstantInt32(-Alignment));
  }

  Variable *Dest = Instr->getDest();
  Operand *TotalSize = legalize(Instr->getSizeInBytes());

  if (const auto *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
    const uint32_t Value =
        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
    if (UseFramePointer) {
      _sub_sp(Ctx->getConstantInt32(Value));
    } else {
      // If we don't need a frame pointer, this alloca has a known offset from
      // the stack pointer. We don't need to adjust the stack pointer, nor
      // assign any value to Dest, as Dest is rematerializable.
      assert(Dest->isRematerializable());
      FixedAllocaSizeBytes += Value;
      Context.insert<InstFakeDef>(Dest);
    }
  } else {
    // Non-constant sizes need to be adjusted to the next highest multiple of
    // the required alignment at runtime.
    Variable *T = nullptr;
    if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 &&
        !NeedSandboxing) {
      T = makeReg(IceType_i64);
      _movzx(T, TotalSize);
    } else {
      T = makeReg(IceType_i32);
      _mov(T, TotalSize);
    }
    _add(T, Ctx->getConstantInt32(Alignment - 1));
    _and(T, Ctx->getConstantInt32(-Alignment));
    _sub_sp(T);
  }
  // Add enough to the returned address to account for the out args area.
  uint32_t OutArgsSize = maxOutArgsSizeBytes();
  if (OutArgsSize > 0) {
    Variable *T = makeReg(IceType_i32);
    auto *CalculateOperand = X86OperandMem::create(
        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
    _lea(T, CalculateOperand);
    _mov(Dest, T);
  } else {
    _mov(Dest, esp);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerArguments() {
  const bool OptM1 = Func->getOptLevel() == Opt_m1;
  VarList &Args = Func->getArgs();
  unsigned NumXmmArgs = 0;
  bool XmmSlotsRemain = true;
  unsigned NumGprArgs = 0;
  bool GprSlotsRemain = true;

  Context.init(Func->getEntryNode());
  Context.setInsertPoint(Context.getCur());

  for (SizeT i = 0, End = Args.size();
       i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
    Variable *Arg = Args[i];
    Type Ty = Arg->getType();
    Variable *RegisterArg = nullptr;
    RegNumT RegNum;
    if (isVectorType(Ty)) {
      RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
      if (RegNum.hasNoValue()) {
        XmmSlotsRemain = false;
        continue;
      }
      ++NumXmmArgs;
      RegisterArg = Func->makeVariable(Ty);
    } else if (isScalarFloatingType(Ty)) {
      if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
        continue;
      }
      RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
      if (RegNum.hasNoValue()) {
        XmmSlotsRemain = false;
        continue;
      }
      ++NumXmmArgs;
      RegisterArg = Func->makeVariable(Ty);
    } else if (isScalarIntegerType(Ty)) {
      RegNum = Traits::getRegisterForGprArgNum(Ty, NumGprArgs);
      if (RegNum.hasNoValue()) {
        GprSlotsRemain = false;
        continue;
      }
      ++NumGprArgs;
      RegisterArg = Func->makeVariable(Ty);
    }
    assert(RegNum.hasValue());
    assert(RegisterArg != nullptr);
    // Replace Arg in the argument list with the home register. Then generate
    // an instruction in the prolog to copy the home register to the assigned
    // location of Arg.
    if (BuildDefs::dump())
      RegisterArg->setName(Func, "home_reg:" + Arg->getName());
    RegisterArg->setRegNum(RegNum);
    RegisterArg->setIsArg();
    Arg->setIsArg(false);

    Args[i] = RegisterArg;
    // When not Om1, do the assignment through a temporary, instead of directly
    // from the pre-colored variable, so that a subsequent availabilityGet()
    // call has a chance to work.  (In Om1, don't bother creating extra
    // instructions with extra variables to register-allocate.)
    if (OptM1) {
      Context.insert<InstAssign>(Arg, RegisterArg);
    } else {
      Variable *Tmp = makeReg(RegisterArg->getType());
      Context.insert<InstAssign>(Tmp, RegisterArg);
      Context.insert<InstAssign>(Arg, Tmp);
    }
  }
  if (!OptM1)
    Context.availabilityUpdate();
}

/// Strength-reduce scalar integer multiplication by a constant (for i32 or
/// narrower) for certain constants. The lea instruction can be used to multiply
/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
/// lea-based multiplies by 5, combined with left-shifting by 2.
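///
/// For example (illustrative only), t *= 100 with 100 = 5 * 5 * 4 could lower
/// to:
///   lea t, [t+4*t]  ; t *= 5
///   lea t, [t+4*t]  ; t *= 5
///   shl t, 2        ; t *= 4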
template <typename TraitsType>
bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
                                                  int32_t Src1) {
  // Disable this optimization for Om1 and O0, just to keep things simple
  // there.
  if (Func->getOptLevel() < Opt_1)
    return false;
  Type Ty = Dest->getType();
  if (Src1 == -1) {
    Variable *T = nullptr;
    _mov(T, Src0);
    _neg(T);
    _mov(Dest, T);
    return true;
  }
  if (Src1 == 0) {
    _mov(Dest, Ctx->getConstantZero(Ty));
    return true;
  }
  if (Src1 == 1) {
    Variable *T = nullptr;
    _mov(T, Src0);
    _mov(Dest, T);
    return true;
  }
  // Don't bother with the edge case where Src1 == MININT.
  if (Src1 == -Src1)
    return false;
  const bool Src1IsNegative = Src1 < 0;
  if (Src1IsNegative)
    Src1 = -Src1;
  uint32_t Count9 = 0;
  uint32_t Count5 = 0;
  uint32_t Count3 = 0;
  uint32_t Count2 = 0;
  uint32_t CountOps = 0;
  while (Src1 > 1) {
    if (Src1 % 9 == 0) {
      ++CountOps;
      ++Count9;
      Src1 /= 9;
    } else if (Src1 % 5 == 0) {
      ++CountOps;
      ++Count5;
      Src1 /= 5;
    } else if (Src1 % 3 == 0) {
      ++CountOps;
      ++Count3;
      Src1 /= 3;
    } else if (Src1 % 2 == 0) {
      if (Count2 == 0)
        ++CountOps;
      ++Count2;
      Src1 /= 2;
    } else {
      return false;
    }
  }
  // The lea optimization only works for i32 (and, on 64-bit targets, i64), so
  // bail out for i8 and i16 whenever a lea-based factor is present.
  if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
      (Count3 || Count5 || Count9))
    return false;
  // Limit the number of lea/shl operations for a single multiply, to a
  // somewhat arbitrary choice of 3.
  constexpr uint32_t MaxOpsForOptimizedMul = 3;
  if (CountOps > MaxOpsForOptimizedMul)
    return false;
  Variable *T = makeReg(Traits::WordType);
  if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    _movzx(T, Src0RM);
  } else {
    _mov(T, Src0);
  }
  Constant *Zero = Ctx->getConstantZero(IceType_i32);
  for (uint32_t i = 0; i < Count9; ++i) {
    constexpr uint16_t Shift = 3; // log2(9-1)
    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
  }
  for (uint32_t i = 0; i < Count5; ++i) {
    constexpr uint16_t Shift = 2; // log2(5-1)
    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
  }
  for (uint32_t i = 0; i < Count3; ++i) {
    constexpr uint16_t Shift = 1; // log2(3-1)
    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
  }
  if (Count2) {
    _shl(T, Ctx->getConstantInt(Ty, Count2));
  }
  if (Src1IsNegative)
    _neg(T);
  _mov(Dest, T);
  return true;
}

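/// Lowers a 64-bit shift on a 32-bit target. The two 32-bit halves are
/// combined with shld/shrd; shift amounts of 32 or more are resolved
/// statically for constant counts, and with a "test t1, 0x20" fixup branch for
/// variable counts (the hardware shift instructions only use the count modulo
/// 32).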
template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
                                             Operand *Src0Lo, Operand *Src0Hi,
                                             Operand *Src1Lo, Variable *DestLo,
                                             Variable *DestHi) {
  // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
  Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
  Constant *Zero = Ctx->getConstantZero(IceType_i32);
  Constant *SignExtend = Ctx->getConstantInt32(0x1f);
  if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
    uint32_t ShiftAmount = ConstantShiftAmount->getValue();
    if (ShiftAmount > 32) {
      Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
      switch (Op) {
      default:
        assert(0 && "non-shift op");
        break;
      case InstArithmetic::Shl: {
        // a=b<<c ==>
        //   t2 = b.lo
        //   t2 = shl t2, ShiftAmount-32
        //   t3 = t2
        //   t2 = 0
        _mov(T_2, Src0Lo);
        _shl(T_2, ReducedShift);
        _mov(DestHi, T_2);
        _mov(DestLo, Zero);
      } break;
      case InstArithmetic::Lshr: {
        // a=b>>c (unsigned) ==>
        //   t2 = b.hi
        //   t2 = shr t2, ShiftAmount-32
        //   a.lo = t2
        //   a.hi = 0
        _mov(T_2, Src0Hi);
        _shr(T_2, ReducedShift);
        _mov(DestLo, T_2);
        _mov(DestHi, Zero);
      } break;
      case InstArithmetic::Ashr: {
        // a=b>>c (signed) ==>
        //   t3 = b.hi
        //   t3 = sar t3, 0x1f
        //   t2 = b.hi
        //   t2 = shrd t2, t3, ShiftAmount-32
        //   a.lo = t2
        //   a.hi = t3
        _mov(T_3, Src0Hi);
        _sar(T_3, SignExtend);
        _mov(T_2, Src0Hi);
        _shrd(T_2, T_3, ReducedShift);
        _mov(DestLo, T_2);
        _mov(DestHi, T_3);
      } break;
      }
    } else if (ShiftAmount == 32) {
      switch (Op) {
      default:
        assert(0 && "non-shift op");
        break;
      case InstArithmetic::Shl: {
        // a=b<<c ==>
        //   t2 = b.lo
        //   a.hi = t2
        //   a.lo = 0
        _mov(T_2, Src0Lo);
        _mov(DestHi, T_2);
        _mov(DestLo, Zero);
      } break;
      case InstArithmetic::Lshr: {
        // a=b>>c (unsigned) ==>
        //   t2 = b.hi
        //   a.lo = t2
        //   a.hi = 0
        _mov(T_2, Src0Hi);
        _mov(DestLo, T_2);
        _mov(DestHi, Zero);
      } break;
      case InstArithmetic::Ashr: {
        // a=b>>c (signed) ==>
        //   t2 = b.hi
        //   a.lo = t2
        //   t3 = b.hi
        //   t3 = sar t3, 0x1f
        //   a.hi = t3
        _mov(T_2, Src0Hi);
        _mov(DestLo, T_2);
        _mov(T_3, Src0Hi);
        _sar(T_3, SignExtend);
        _mov(DestHi, T_3);
      } break;
      }
    } else {
      // COMMON PREFIX OF: a=b SHIFT_OP c ==>
      //   t2 = b.lo
      //   t3 = b.hi
      _mov(T_2, Src0Lo);
      _mov(T_3, Src0Hi);
      switch (Op) {
      default:
        assert(0 && "non-shift op");
        break;
      case InstArithmetic::Shl: {
        // a=b<<c ==>
        //   t3 = shld t3, t2, ShiftAmount
        //   t2 = shl t2, ShiftAmount
        _shld(T_3, T_2, ConstantShiftAmount);
        _shl(T_2, ConstantShiftAmount);
      } break;
      case InstArithmetic::Lshr: {
        // a=b>>c (unsigned) ==>
        //   t2 = shrd t2, t3, ShiftAmount
        //   t3 = shr t3, ShiftAmount
        _shrd(T_2, T_3, ConstantShiftAmount);
        _shr(T_3, ConstantShiftAmount);
      } break;
      case InstArithmetic::Ashr: {
        // a=b>>c (signed) ==>
        //   t2 = shrd t2, t3, ShiftAmount
        //   t3 = sar t3, ShiftAmount
        _shrd(T_2, T_3, ConstantShiftAmount);
        _sar(T_3, ConstantShiftAmount);
      } break;
      }
      // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
      //   a.lo = t2
      //   a.hi = t3
      _mov(DestLo, T_2);
      _mov(DestHi, T_3);
    }
  } else {
    // NON-CONSTANT CASES.
    Constant *BitTest = Ctx->getConstantInt32(0x20);
    InstX86Label *Label = InstX86Label::create(Func, this);
    // COMMON PREFIX OF: a=b SHIFT_OP c ==>
    //   t1:ecx = c.lo & 0xff
    //   t2 = b.lo
    //   t3 = b.hi
    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
    _mov(T_2, Src0Lo);
    _mov(T_3, Src0Hi);
    switch (Op) {
    default:
      assert(0 && "non-shift op");
      break;
    case InstArithmetic::Shl: {
      // a=b<<c ==>
      //   t3 = shld t3, t2, t1
      //   t2 = shl t2, t1
      //   test t1, 0x20
      //   je L1
      //   use(t3)
      //   t3 = t2
      //   t2 = 0
      _shld(T_3, T_2, T_1);
      _shl(T_2, T_1);
      _test(T_1, BitTest);
      _br(Traits::Cond::Br_e, Label);
      // T_2 and T_3 are being assigned again because of the intra-block control
      // flow, so we need to use _redefined to avoid liveness problems.
      _redefined(_mov(T_3, T_2));
      _redefined(_mov(T_2, Zero));
    } break;
    case InstArithmetic::Lshr: {
      // a=b>>c (unsigned) ==>
      //   t2 = shrd t2, t3, t1
      //   t3 = shr t3, t1
      //   test t1, 0x20
      //   je L1
      //   use(t2)
      //   t2 = t3
      //   t3 = 0
      _shrd(T_2, T_3, T_1);
      _shr(T_3, T_1);
      _test(T_1, BitTest);
      _br(Traits::Cond::Br_e, Label);
      // T_2 and T_3 are being assigned again because of the intra-block control
      // flow, so we need to use _redefined to avoid liveness problems.
      _redefined(_mov(T_2, T_3));
      _redefined(_mov(T_3, Zero));
    } break;
    case InstArithmetic::Ashr: {
      // a=b>>c (signed) ==>
      //   t2 = shrd t2, t3, t1
      //   t3 = sar t3, t1
      //   test t1, 0x20
      //   je L1
      //   use(t2)
      //   t2 = t3
      //   t3 = sar t3, 0x1f
      Constant *SignExtend = Ctx->getConstantInt32(0x1f);
      _shrd(T_2, T_3, T_1);
      _sar(T_3, T_1);
      _test(T_1, BitTest);
      _br(Traits::Cond::Br_e, Label);
      // T_2 and T_3 are being assigned again because of the intra-block control
      // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
      // doesn't need special treatment because it is reassigned via _sar
      // instead of _mov.
      _redefined(_mov(T_2, T_3));
      _sar(T_3, SignExtend);
    } break;
    }
    // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
    // L1:
    //   a.lo = t2
    //   a.hi = t3
    Context.insert(Label);
    _mov(DestLo, T_2);
    _mov(DestHi, T_3);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
  Variable *Dest = Instr->getDest();
  if (Dest->isRematerializable()) {
    Context.insert<InstFakeDef>(Dest);
    return;
  }
  Type Ty = Dest->getType();
  Operand *Src0 = legalize(Instr->getSrc(0));
  Operand *Src1 = legalize(Instr->getSrc(1));
  if (Instr->isCommutative()) {
    uint32_t SwapCount = 0;
    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
      std::swap(Src0, Src1);
      ++SwapCount;
    }
    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      ++SwapCount;
    }
    // Improve two-address code patterns by avoiding a copy to the dest
    // register when one of the source operands ends its lifetime here.
    if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
      std::swap(Src0, Src1);
      ++SwapCount;
    }
    assert(SwapCount <= 1);
    (void)SwapCount;
  }
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    // These x86-32 helper-call-involved instructions are lowered in this
    // separate switch because loOperand() and hiOperand() may insert redundant
    // instructions for constant blinding and pooling, and such redundant
    // instructions would fail liveness analysis under -Om1. Besides, these
    // arguments do not need to be processed with loOperand() and hiOperand()
    // to be used.
    switch (Instr->getOp()) {
    case InstArithmetic::Udiv:
    case InstArithmetic::Sdiv:
    case InstArithmetic::Urem:
    case InstArithmetic::Srem:
      llvm::report_fatal_error("Helper call was expected");
      return;
    default:
      break;
    }

    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
    Operand *Src0Lo = loOperand(Src0);
    Operand *Src0Hi = hiOperand(Src0);
    Operand *Src1Lo = loOperand(Src1);
    Operand *Src1Hi = hiOperand(Src1);
    Variable *T_Lo = nullptr, *T_Hi = nullptr;
    switch (Instr->getOp()) {
    case InstArithmetic::_num:
      llvm_unreachable("Unknown arithmetic operator");
      break;
    case InstArithmetic::Add:
      _mov(T_Lo, Src0Lo);
      _add(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _adc(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::And:
      _mov(T_Lo, Src0Lo);
      _and(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _and(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Or:
      _mov(T_Lo, Src0Lo);
      _or(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _or(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Xor:
      _mov(T_Lo, Src0Lo);
      _xor(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _xor(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Sub:
      _mov(T_Lo, Src0Lo);
      _sub(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _sbb(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Mul: {
      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
      Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
      Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
      // gcc does the following:
      // a=b*c ==>
      //   t1 = b.hi; t1 *=(imul) c.lo
      //   t2 = c.hi; t2 *=(imul) b.lo
      //   t3:eax = b.lo
      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
      //   a.lo = t4.lo
      //   t4.hi += t1
      //   t4.hi += t2
      //   a.hi = t4.hi
      // The mul instruction cannot take an immediate operand.
      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
      _mov(T_1, Src0Hi);
      _imul(T_1, Src1Lo);
      _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
      _mul(T_4Lo, T_3, Src1Lo);
      // The mul instruction produces two dest variables, edx:eax. We create a
      // fake definition of edx to account for this.
      Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
      Context.insert<InstFakeUse>(T_4Hi);
      _mov(DestLo, T_4Lo);
      _add(T_4Hi, T_1);
      _mov(T_2, Src1Hi);
      _imul(T_2, Src0Lo);
      _add(T_4Hi, T_2);
      _mov(DestHi, T_4Hi);
    } break;
    case InstArithmetic::Shl:
    case InstArithmetic::Lshr:
    case InstArithmetic::Ashr:
      lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
      break;
    case InstArithmetic::Fadd:
    case InstArithmetic::Fsub:
    case InstArithmetic::Fmul:
    case InstArithmetic::Fdiv:
    case InstArithmetic::Frem:
      llvm_unreachable("FP instruction with i64 type");
      break;
    case InstArithmetic::Udiv:
    case InstArithmetic::Sdiv:
    case InstArithmetic::Urem:
    case InstArithmetic::Srem:
      llvm_unreachable("Call-helper-involved instruction for i64 type \
                       should have already been handled before");
      break;
    }
    return;
  }
  if (isVectorType(Ty)) {
    // TODO: Trap on integer divide and integer modulo by zero. See:
    // https://code.google.com/p/nativeclient/issues/detail?id=3899
    if (llvm::isa<X86OperandMem>(Src1))
      Src1 = legalizeToReg(Src1);
    switch (Instr->getOp()) {
    case InstArithmetic::_num:
      llvm_unreachable("Unknown arithmetic operator");
      break;
    case InstArithmetic::Add: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _padd(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::And: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _pand(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Or: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _por(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Xor: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _pxor(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Sub: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _psub(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Mul: {
      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
      bool InstructionSetIsValidForPmull =
          Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
        Variable *T = makeReg(Ty);
        _movp(T, Src0);
        _pmull(T, Src0 == Src1 ? T : Src1);
        _movp(Dest, T);
      } else if (Ty == IceType_v4i32) {
        // Lowering sequence:
        // Note: The mask arguments have index 0 on the left.
        //
        // movups  T1, Src0
        // pshufd  T2, Src0, {1,0,3,0}
        // pshufd  T3, Src1, {1,0,3,0}
        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
        // pmuludq T1, Src1
        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
        // pmuludq T2, T3
        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
        // shufps  T1, T2, {0,2,0,2}
        // pshufd  T4, T1, {0,2,1,3}
        // movups  Dest, T4
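        //
        // (Note: pmuludq multiplies only the even 32-bit lanes -- 0 and 2 --
        // into 64-bit products, which is why the pshufd shuffles first move
        // lanes 1 and 3 into even positions.)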

        // Mask that directs pshufd to create a vector with entries
        // Src[1, 0, 3, 0]
        constexpr unsigned Constant1030 = 0x31;
        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
        // Mask that directs shufps to create a vector with entries
        // Dest[0, 2], Src[0, 2]
        constexpr unsigned Mask0202 = 0x88;
        // Mask that directs pshufd to create a vector with entries
        // Src[0, 2, 1, 3]
        constexpr unsigned Mask0213 = 0xd8;
        Variable *T1 = makeReg(IceType_v4i32);
        Variable *T2 = makeReg(IceType_v4i32);
        Variable *T3 = makeReg(IceType_v4i32);
        Variable *T4 = makeReg(IceType_v4i32);
        _movp(T1, Src0);
        _pshufd(T2, Src0, Mask1030);
        _pshufd(T3, Src1, Mask1030);
        _pmuludq(T1, Src1);
        _pmuludq(T2, T3);
        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
        _movp(Dest, T4);
      } else if (Ty == IceType_v16i8) {
        llvm::report_fatal_error("Scalarized operation was expected");
      } else {
        llvm::report_fatal_error("Invalid vector multiply type");
      }
    } break;
    case InstArithmetic::Shl: {
      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _psll(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Lshr: {
      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _psrl(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Ashr: {
      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _psra(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Udiv:
    case InstArithmetic::Urem:
    case InstArithmetic::Sdiv:
    case InstArithmetic::Srem:
      llvm::report_fatal_error("Scalarized operation was expected");
      break;
    case InstArithmetic::Fadd: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _addps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fsub: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _subps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fmul: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _mulps(T, Src0 == Src1 ? T : Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fdiv: {
      Variable *T = makeReg(Ty);
      _movp(T, Src0);
      _divps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Frem:
      llvm::report_fatal_error("Scalarized operation was expected");
      break;
    }
    return;
  }
  Variable *T_edx = nullptr;
  Variable *T = nullptr;
  switch (Instr->getOp()) {
  case InstArithmetic::_num:
    llvm_unreachable("Unknown arithmetic operator");
    break;
  case InstArithmetic::Add: {
    const bool ValidType =
        Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
    auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
    const bool ValidKind =
        Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
                             llvm::isa<ConstantRelocatable>(Const));
    if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
      auto *Var = legalizeToReg(Src0);
      auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
      T = makeReg(Ty);
      _lea(T, _sandbox_mem_reference(Mem));
      _mov(Dest, T);
      break;
    }
    _mov(T, Src0);
    _add(T, Src1);
    _mov(Dest, T);
  } break;
  case InstArithmetic::And:
    _mov(T, Src0);
    _and(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Or:
    _mov(T, Src0);
    _or(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Xor:
    _mov(T, Src0);
    _xor(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Sub:
    _mov(T, Src0);
    _sub(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Mul:
    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
      if (optimizeScalarMul(Dest, Src0, C->getValue()))
        return;
    }
    // The 8-bit version of imul only allows the form "imul r/m8" where T must
    // be in al.
    if (isByteSizedArithType(Ty)) {
      _mov(T, Src0, Traits::RegisterSet::Reg_al);
      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
      _imul(T, Src0 == Src1 ? T : Src1);
      _mov(Dest, T);
    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
      T = makeReg(Ty);
      _imul_imm(T, Src0, ImmConst);
      _mov(Dest, T);
    } else {
      _mov(T, Src0);
      _imul(T, Src0 == Src1 ? T : Src1);
      _mov(Dest, T);
    }
    break;
  case InstArithmetic::Shl:
    _mov(T, Src0);
    if (!llvm::isa<ConstantInteger32>(Src1) &&
        !llvm::isa<ConstantInteger64>(Src1))
      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
    _shl(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Lshr:
    _mov(T, Src0);
    if (!llvm::isa<ConstantInteger32>(Src1) &&
        !llvm::isa<ConstantInteger64>(Src1))
      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
    _shr(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Ashr:
    _mov(T, Src0);
    if (!llvm::isa<ConstantInteger32>(Src1) &&
        !llvm::isa<ConstantInteger64>(Src1))
      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
    _sar(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Udiv: {
    // div and idiv are the few arithmetic operators that do not allow
    // immediates as the operand.
    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
    RegNumT Eax;
    RegNumT Edx;
    switch (Ty) {
    default:
      llvm::report_fatal_error("Bad type for udiv");
    case IceType_i64:
      Eax = Traits::getRaxOrDie();
      Edx = Traits::getRdxOrDie();
      break;
    case IceType_i32:
      Eax = Traits::RegisterSet::Reg_eax;
      Edx = Traits::RegisterSet::Reg_edx;
      break;
    case IceType_i16:
      Eax = Traits::RegisterSet::Reg_ax;
      Edx = Traits::RegisterSet::Reg_dx;
      break;
    case IceType_i8:
      Eax = Traits::RegisterSet::Reg_al;
      Edx = Traits::RegisterSet::Reg_ah;
      break;
    }
    T_edx = makeReg(Ty, Edx);
    _mov(T, Src0, Eax);
    _mov(T_edx, Ctx->getConstantZero(Ty));
    _div(T_edx, Src1, T);
    _redefined(Context.insert<InstFakeDef>(T, T_edx));
    _mov(Dest, T);
  } break;
  case InstArithmetic::Sdiv:
    // TODO(stichnot): Enable this after doing better performance and cross
    // testing.
    if (false && Func->getOptLevel() >= Opt_1) {
      // Optimize division by constant power of 2, but not for Om1 or O0, just
      // to keep things simple there.
      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
        const int32_t Divisor = C->getValue();
        const uint32_t UDivisor = Divisor;
        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
          uint32_t LogDiv = llvm::Log2_32(UDivisor);
          // LLVM does the following for dest=src/(1<<log):
          //   t=src
          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
          //   shr t,typewidth-log
          //   add t,src
          //   sar t,log
          //   dest=t
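          //   e.g. for dest=src/8 on i32: sar t,31; shr t,29; add t,src;
          //   sar t,3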
          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
          _mov(T, Src0);
          // If for some reason we are dividing by 1, just treat it like an
          // assignment.
          if (LogDiv > 0) {
            // The initial sar is unnecessary when dividing by 2.
            if (LogDiv > 1)
              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
            _add(T, Src0);
            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
          }
          _mov(Dest, T);
          return;
        }
      }
    }
    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
    switch (Ty) {
    default:
      llvm::report_fatal_error("Bad type for sdiv");
    case IceType_i64:
      T_edx = makeReg(Ty, Traits::getRdxOrDie());
      _mov(T, Src0, Traits::getRaxOrDie());
      break;
    case IceType_i32:
      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
      break;
    case IceType_i16:
      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
      break;
    case IceType_i8:
      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
      _mov(T, Src0, Traits::RegisterSet::Reg_al);
      break;
    }
    _cbwdq(T_edx, T);
    _idiv(T_edx, Src1, T);
    _redefined(Context.insert<InstFakeDef>(T, T_edx));
    _mov(Dest, T);
    break;
  case InstArithmetic::Urem: {
    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
    RegNumT Eax;
    RegNumT Edx;
    switch (Ty) {
    default:
      llvm::report_fatal_error("Bad type for urem");
    case IceType_i64:
      Eax = Traits::getRaxOrDie();
      Edx = Traits::getRdxOrDie();
      break;
    case IceType_i32:
      Eax = Traits::RegisterSet::Reg_eax;
      Edx = Traits::RegisterSet::Reg_edx;
      break;
    case IceType_i16:
      Eax = Traits::RegisterSet::Reg_ax;
      Edx = Traits::RegisterSet::Reg_dx;
      break;
    case IceType_i8:
      Eax = Traits::RegisterSet::Reg_al;
      Edx = Traits::RegisterSet::Reg_ah;
      break;
    }
    T_edx = makeReg(Ty, Edx);
    _mov(T_edx, Ctx->getConstantZero(Ty));
    _mov(T, Src0, Eax);
    _div(T, Src1, T_edx);
    _redefined(Context.insert<InstFakeDef>(T_edx, T));
    if (Ty == IceType_i8) {
      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
      // moved into a general 8-bit register.
      auto *T_AhRcvr = makeReg(Ty);
      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
      _mov(T_AhRcvr, T_edx);
      T_edx = T_AhRcvr;
    }
    _mov(Dest, T_edx);
  } break;
  case InstArithmetic::Srem: {
    // TODO(stichnot): Enable this after doing better performance and cross
    // testing.
    if (false && Func->getOptLevel() >= Opt_1) {
      // Optimize mod by constant power of 2, but not for Om1 or O0, just to
      // keep things simple there.
      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
        const int32_t Divisor = C->getValue();
        const uint32_t UDivisor = Divisor;
        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
          uint32_t LogDiv = llvm::Log2_32(UDivisor);
          // LLVM does the following for dest=src%(1<<log):
          //   t=src
          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
          //   shr t,typewidth-log
          //   add t,src
          //   and t, -(1<<log)
          //   sub t,src
          //   neg t
          //   dest=t
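          //   e.g. for dest=src%8 on i32: sar t,31; shr t,29; add t,src;
          //   and t,-8; sub t,src; neg t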
          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
          // If for some reason we are dividing by 1, just assign 0.
          if (LogDiv == 0) {
            _mov(Dest, Ctx->getConstantZero(Ty));
            return;
          }
          _mov(T, Src0);
          // The initial sar is unnecessary when dividing by 2.
          if (LogDiv > 1)
            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
          _add(T, Src0);
          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
          _sub(T, Src0);
          _neg(T);
          _mov(Dest, T);
          return;
        }
      }
    }
    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
    RegNumT Eax;
    RegNumT Edx;
    switch (Ty) {
    default:
      llvm::report_fatal_error("Bad type for srem");
    case IceType_i64:
      Eax = Traits::getRaxOrDie();
      Edx = Traits::getRdxOrDie();
      break;
    case IceType_i32:
      Eax = Traits::RegisterSet::Reg_eax;
      Edx = Traits::RegisterSet::Reg_edx;
      break;
    case IceType_i16:
      Eax = Traits::RegisterSet::Reg_ax;
      Edx = Traits::RegisterSet::Reg_dx;
      break;
    case IceType_i8:
      Eax = Traits::RegisterSet::Reg_al;
      Edx = Traits::RegisterSet::Reg_ah;
      break;
    }
    T_edx = makeReg(Ty, Edx);
    _mov(T, Src0, Eax);
    _cbwdq(T_edx, T);
    _idiv(T, Src1, T_edx);
    _redefined(Context.insert<InstFakeDef>(T_edx, T));
    if (Ty == IceType_i8) {
      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
      // moved into a general 8-bit register.
      auto *T_AhRcvr = makeReg(Ty);
      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
      _mov(T_AhRcvr, T_edx);
      T_edx = T_AhRcvr;
    }
    _mov(Dest, T_edx);
  } break;
  case InstArithmetic::Fadd:
    _mov(T, Src0);
    _addss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Fsub:
    _mov(T, Src0);
    _subss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Fmul:
    _mov(T, Src0);
    _mulss(T, Src0 == Src1 ? T : Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Fdiv:
    _mov(T, Src0);
    _divss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Frem:
    llvm::report_fatal_error("Helper call was expected");
    break;
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
  Variable *Dest = Instr->getDest();
  if (Dest->isRematerializable()) {
    Context.insert<InstFakeDef>(Dest);
    return;
  }
  Operand *Src = Instr->getSrc(0);
  assert(Dest->getType() == Src->getType());
  lowerMove(Dest, Src, false);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
  if (Br->isUnconditional()) {
    _br(Br->getTargetUnconditional());
    return;
  }
  Operand *Cond = Br->getCondition();

  // Handle folding opportunities.
  if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
    assert(Producer->isDeleted());
    switch (BoolFolding<Traits>::getProducerKind(Producer)) {
    default:
      break;
    case BoolFolding<Traits>::PK_Icmp32:
    case BoolFolding<Traits>::PK_Icmp64: {
      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
      return;
    }
    case BoolFolding<Traits>::PK_Fcmp: {
      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
      return;
    }
    case BoolFolding<Traits>::PK_Arith: {
      lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
      return;
    }
    }
  }
  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
  Constant *Zero = Ctx->getConstantZero(IceType_i32);
  _cmp(Src0, Zero);
  _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
}

// constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
// OperandList in lowerCall. std::max() is not constexpr until C++14, so it
// can't be used here.
inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
  return S0 < S1 ? S1 : S0;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
  // Common x86 calling convention lowering:
  //
  // * At the point before the call, the stack must be aligned to 16 bytes.
  //
  // * Non-register arguments are pushed onto the stack in right-to-left order,
  // such that the left-most argument ends up on the top of the stack at the
  // lowest memory address.
  //
  // * Stack arguments of vector type are aligned to start at the next highest
  // multiple of 16 bytes. Other stack arguments are aligned to the next word
  // size boundary (4 or 8 bytes, respectively).
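  //
  // Vector arguments (and, where the ABI allows, scalar arguments) that fit
  // in the available xmm/GPR argument registers are passed in registers;
  // everything else is assigned an esp-relative stack slot below.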
   2617   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
   2618                                             Traits::X86_STACK_ALIGNMENT_BYTES);
   2619 
   2620   using OperandList =
   2621       llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
   2622                                                 Traits::X86_MAX_GPR_ARGS)>;
   2623   OperandList XmmArgs;
   2624   CfgVector<std::pair<const Type, Operand *>> GprArgs;
   2625   OperandList StackArgs, StackArgLocations;
   2626   uint32_t ParameterAreaSizeBytes = 0;
   2627 
   2628   // Classify each argument operand according to the location where the argument
   2629   // is passed.
   2630   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
   2631     Operand *Arg = Instr->getArg(i);
   2632     const Type Ty = Arg->getType();
   2633     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
   2634     assert(typeWidthInBytes(Ty) >= 4);
   2635     if (isVectorType(Ty) &&
   2636         Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
   2637       XmmArgs.push_back(Arg);
   2638     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
   2639                Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
   2640       XmmArgs.push_back(Arg);
   2641     } else if (isScalarIntegerType(Ty) &&
   2642                Traits::getRegisterForGprArgNum(Ty, GprArgs.size()).hasValue()) {
   2643       GprArgs.emplace_back(Ty, Arg);
   2644     } else {
   2645       // Place on stack.
   2646       StackArgs.push_back(Arg);
   2647       if (isVectorType(Arg->getType())) {
   2648         ParameterAreaSizeBytes =
   2649             Traits::applyStackAlignment(ParameterAreaSizeBytes);
   2650       }
   2651       Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
   2652       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
   2653       StackArgLocations.push_back(
   2654           Traits::X86OperandMem::create(Func, Ty, esp, Loc));
   2655       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
   2656     }
   2657   }
   2658   // Ensure there is enough space for the fstp/mov used for floating-point
        // returns.
   2659   Variable *Dest = Instr->getDest();
   2660   const Type DestTy = Dest ? Dest->getType() : IceType_void;
   2661   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
   2662     if (isScalarFloatingType(DestTy)) {
   2663       ParameterAreaSizeBytes =
   2664           std::max(static_cast<size_t>(ParameterAreaSizeBytes),
   2665                    typeWidthInBytesOnStack(DestTy));
   2666     }
   2667   }
   2668   // Adjust the parameter area so that the stack is aligned. It is assumed that
   2669   // the stack is already aligned at the start of the calling sequence.
   2670   ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
   2671   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
   2672   // Copy arguments that are passed on the stack to the appropriate stack
   2673   // locations.  We make sure legalize() is called on each argument at this
   2674   // point, to allow availabilityGet() to work.
   2675   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
   2676     lowerStore(
   2677         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
   2678   }
   2679   // Copy arguments to be passed in registers to the appropriate registers.
   2680   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
   2681     XmmArgs[i] =
   2682         legalizeToReg(legalize(XmmArgs[i]), Traits::getRegisterForXmmArgNum(i));
   2683   }
   2684   // Materialize moves for arguments passed in GPRs.
   2685   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
   2686     const Type SignatureTy = GprArgs[i].first;
   2687     Operand *Arg =
   2688         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
   2689     GprArgs[i].second =
   2690         legalizeToReg(Arg, Traits::getRegisterForGprArgNum(Arg->getType(), i));
   2691     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
   2692     assert(SignatureTy == Arg->getType());
   2693     (void)SignatureTy;
   2694   }
   2695   // Generate a FakeUse of register arguments so that they do not get dead code
   2696   // eliminated as a result of the FakeKill of scratch registers after the call.
   2697   // These need to be right before the call instruction.
   2698   for (auto *Arg : XmmArgs) {
   2699     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
   2700   }
   2701   for (auto &ArgPair : GprArgs) {
   2702     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
   2703   }
   2704   // Generate the call instruction. Assign its result to a temporary with high
   2705   // register allocation weight.
   2706   // ReturnReg doubles as ReturnRegLo as necessary.
   2707   Variable *ReturnReg = nullptr;
   2708   Variable *ReturnRegHi = nullptr;
   2709   if (Dest) {
   2710     switch (DestTy) {
   2711     case IceType_NUM:
   2712     case IceType_void:
   2713     case IceType_i1:
   2714     case IceType_i8:
   2715     case IceType_i16:
   2716       llvm::report_fatal_error("Invalid Call dest type");
   2717       break;
   2718     case IceType_i32:
   2719       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
   2720       break;
   2721     case IceType_i64:
   2722       if (Traits::Is64Bit) {
   2723         ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
   2724       } else {
   2725         ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
   2726         ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
   2727       }
   2728       break;
   2729     case IceType_f32:
   2730     case IceType_f64:
   2731       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
   2732         // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
   2733         // the fstp instruction.
   2734         break;
   2735       }
   2736     // Fallthrough intended.
   2737     case IceType_v4i1:
   2738     case IceType_v8i1:
   2739     case IceType_v16i1:
   2740     case IceType_v16i8:
   2741     case IceType_v8i16:
   2742     case IceType_v4i32:
   2743     case IceType_v4f32:
   2744       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
   2745       break;
   2746     }
   2747   }
   2748   // Emit the call to the function.
   2749   Operand *CallTarget =
   2750       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
   2751   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg);
   2752   // Keep the upper return register live on 32-bit platforms.
   2753   if (ReturnRegHi)
   2754     Context.insert<InstFakeDef>(ReturnRegHi);
   2755   // Mark the call as killing all the caller-save registers.
   2756   Context.insert<InstFakeKill>(NewCall);
   2757   // Handle x86-32 floating point returns.
   2758   if (Dest != nullptr && isScalarFloatingType(DestTy) &&
   2759       !Traits::X86_PASS_SCALAR_FP_IN_XMM) {
   2760     // Special treatment for an FP function which returns its result in st(0).
   2761     // If Dest ends up being a physical xmm register, the fstp emit code will
   2762     // route st(0) through the space reserved in the function argument area
   2763     // we allocated.
   2764     _fstp(Dest);
   2765     // Create a fake use of Dest in case it actually isn't used, because st(0)
   2766     // still needs to be popped.
   2767     Context.insert<InstFakeUse>(Dest);
   2768   }
   2769   // Generate a FakeUse to keep the call live if necessary.
   2770   if (Instr->hasSideEffects() && ReturnReg) {
   2771     Context.insert<InstFakeUse>(ReturnReg);
   2772   }
   2773   // Process the return value, if any.
   2774   if (Dest == nullptr)
   2775     return;
   2776   // Assign the result of the call to Dest.  Route it through a temporary so
   2777   // that the local register availability peephole can be subsequently used.
   2778   Variable *Tmp = nullptr;
   2779   if (isVectorType(DestTy)) {
   2780     assert(ReturnReg && "Vector type requires a return register");
   2781     Tmp = makeReg(DestTy);
   2782     _movp(Tmp, ReturnReg);
   2783     _movp(Dest, Tmp);
   2784   } else if (isScalarFloatingType(DestTy)) {
   2785     if (Traits::X86_PASS_SCALAR_FP_IN_XMM) {
   2786       assert(ReturnReg && "FP type requires a return register");
   2787       _mov(Tmp, ReturnReg);
   2788       _mov(Dest, Tmp);
   2789     }
   2790   } else {
   2791     assert(isScalarIntegerType(DestTy));
   2792     assert(ReturnReg && "Integer type requires a return register");
   2793     if (DestTy == IceType_i64 && !Traits::Is64Bit) {
   2794       assert(ReturnRegHi && "64-bit type requires two return registers");
   2795       auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
   2796       Variable *DestLo = Dest64On32->getLo();
   2797       Variable *DestHi = Dest64On32->getHi();
   2798       _mov(Tmp, ReturnReg);
   2799       _mov(DestLo, Tmp);
   2800       Variable *TmpHi = nullptr;
   2801       _mov(TmpHi, ReturnRegHi);
   2802       _mov(DestHi, TmpHi);
   2803     } else {
   2804       _mov(Tmp, ReturnReg);
   2805       _mov(Dest, Tmp);
   2806     }
   2807   }
   2808 }
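
         // As an illustrative x86-32 sketch (hypothetical names), `d = call @foo(a, b)`
         // with both arguments passed on the stack is emitted roughly as:
         //   mov  [esp+0], a     ; store stack args into the parameter area
         //   mov  [esp+4], b
         //   call foo            ; followed by a FakeKill of caller-save registers
         //   mov  tmp, eax       ; route the i32 return value through a temporary
         //   mov  d, tmp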
   2809 
   2810 template <typename TraitsType>
   2811 void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
   2812   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
   2813   InstCast::OpKind CastKind = Instr->getCastKind();
   2814   Variable *Dest = Instr->getDest();
   2815   Type DestTy = Dest->getType();
   2816   switch (CastKind) {
   2817   default:
   2818     Func->setError("Cast type not supported");
   2819     return;
   2820   case InstCast::Sext: {
   2821     // Src0RM is the source operand legalized to physical register or memory,
   2822     // but not immediate, since the relevant x86 native instructions don't
   2823     // allow an immediate operand. If the operand is an immediate, we could
   2824     // consider computing the strength-reduced result at translation time, but
   2825     // we're unlikely to see something like that in the bitcode that the
   2826     // optimizer wouldn't have already taken care of.
   2827     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   2828     if (isVectorType(DestTy)) {
   2829       if (DestTy == IceType_v16i8) {
   2830         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
   2831         Variable *OneMask = makeVectorOfOnes(DestTy);
   2832         Variable *T = makeReg(DestTy);
   2833         _movp(T, Src0RM);
   2834         _pand(T, OneMask);
   2835         Variable *Zeros = makeVectorOfZeros(DestTy);
   2836         _pcmpgt(T, Zeros);
   2837         _movp(Dest, T);
   2838       } else {
   2839         // width = width(elty) - 1; dest = (src << width) >> width
   2840         SizeT ShiftAmount =
   2841             Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
   2842             1;
   2843         Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
   2844         Variable *T = makeReg(DestTy);
   2845         _movp(T, Src0RM);
   2846         _psll(T, ShiftConstant);
   2847         _psra(T, ShiftConstant);
   2848         _movp(Dest, T);
   2849       }
   2850     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
   2851       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
   2852       Constant *Shift = Ctx->getConstantInt32(31);
   2853       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   2854       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   2855       Variable *T_Lo = makeReg(DestLo->getType());
   2856       if (Src0RM->getType() == IceType_i32) {
   2857         _mov(T_Lo, Src0RM);
   2858       } else if (Src0RM->getType() == IceType_i1) {
   2859         _movzx(T_Lo, Src0RM);
   2860         _shl(T_Lo, Shift);
   2861         _sar(T_Lo, Shift);
   2862       } else {
   2863         _movsx(T_Lo, Src0RM);
   2864       }
   2865       _mov(DestLo, T_Lo);
   2866       Variable *T_Hi = nullptr;
   2867       _mov(T_Hi, T_Lo);
   2868       if (Src0RM->getType() != IceType_i1)
   2869         // For i1, the sar instruction is already done above.
   2870         _sar(T_Hi, Shift);
   2871       _mov(DestHi, T_Hi);
   2872     } else if (Src0RM->getType() == IceType_i1) {
   2873       // t1 = src
   2874       // shl t1, dst_bitwidth - 1
   2875       // sar t1, dst_bitwidth - 1
   2876       // dst = t1
   2877       size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
   2878       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
   2879       Variable *T = makeReg(DestTy);
   2880       if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
   2881         _mov(T, Src0RM);
   2882       } else {
   2883         // Widen the source using movsx or movzx. (It doesn't matter which one,
   2884         // since the following shl/sar overwrite the bits.)
   2885         _movzx(T, Src0RM);
   2886       }
   2887       _shl(T, ShiftAmount);
   2888       _sar(T, ShiftAmount);
   2889       _mov(Dest, T);
   2890     } else {
   2891       // t1 = movsx src; dst = t1
   2892       Variable *T = makeReg(DestTy);
   2893       _movsx(T, Src0RM);
   2894       _mov(Dest, T);
   2895     }
   2896     break;
   2897   }
   2898   case InstCast::Zext: {
   2899     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   2900     if (isVectorType(DestTy)) {
   2901       // onemask = materialize(1,1,...); dest = onemask & src
   2902       Variable *OneMask = makeVectorOfOnes(DestTy);
   2903       Variable *T = makeReg(DestTy);
   2904       _movp(T, Src0RM);
   2905       _pand(T, OneMask);
   2906       _movp(Dest, T);
   2907     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
   2908       // t1=movzx src; dst.lo=t1; dst.hi=0
   2909       Constant *Zero = Ctx->getConstantZero(IceType_i32);
   2910       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   2911       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   2912       Variable *Tmp = makeReg(DestLo->getType());
   2913       if (Src0RM->getType() == IceType_i32) {
   2914         _mov(Tmp, Src0RM);
   2915       } else {
   2916         _movzx(Tmp, Src0RM);
   2917       }
   2918       _mov(DestLo, Tmp);
   2919       _mov(DestHi, Zero);
   2920     } else if (Src0RM->getType() == IceType_i1) {
   2921       // t = Src0RM; Dest = t
   2922       Variable *T = nullptr;
   2923       if (DestTy == IceType_i8) {
   2924         _mov(T, Src0RM);
   2925       } else {
   2926         assert(DestTy != IceType_i1);
   2927         assert(Traits::Is64Bit || DestTy != IceType_i64);
   2928         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
   2929         // On x86-64 we need to widen T to 64 bits to ensure that T, if
   2930         // written to the stack (i.e., in -Om1), will be fully zero-extended.
   2931         T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
   2932         _movzx(T, Src0RM);
   2933       }
   2934       _mov(Dest, T);
   2935     } else {
   2936       // t1 = movzx src; dst = t1
   2937       Variable *T = makeReg(DestTy);
   2938       _movzx(T, Src0RM);
   2939       _mov(Dest, T);
   2940     }
   2941     break;
   2942   }
   2943   case InstCast::Trunc: {
   2944     if (isVectorType(DestTy)) {
   2945       // onemask = materialize(1,1,...); dst = src & onemask
   2946       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   2947       Type Src0Ty = Src0RM->getType();
   2948       Variable *OneMask = makeVectorOfOnes(Src0Ty);
   2949       Variable *T = makeReg(DestTy);
   2950       _movp(T, Src0RM);
   2951       _pand(T, OneMask);
   2952       _movp(Dest, T);
   2953     } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
   2954       // Make sure we truncate from and into valid registers.
   2955       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
   2956       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
   2957         Src0 = loOperand(Src0);
   2958       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   2959       Variable *T = copyToReg8(Src0RM);
   2960       if (DestTy == IceType_i1)
   2961         _and(T, Ctx->getConstantInt1(1));
   2962       _mov(Dest, T);
   2963     } else {
   2964       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
   2965       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
   2966         Src0 = loOperand(Src0);
   2967       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   2968       // t1 = trunc Src0RM; Dest = t1
   2969       Variable *T = makeReg(DestTy);
   2970       _mov(T, Src0RM);
   2971       _mov(Dest, T);
   2972     }
   2973     break;
   2974   }
   2975   case InstCast::Fptrunc:
   2976   case InstCast::Fpext: {
   2977     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   2978     // t1 = cvt Src0RM; Dest = t1
   2979     Variable *T = makeReg(DestTy);
   2980     _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
   2981     _mov(Dest, T);
   2982     break;
   2983   }
   2984   case InstCast::Fptosi:
   2985     if (isVectorType(DestTy)) {
   2986       assert(DestTy == IceType_v4i32);
   2987       assert(Instr->getSrc(0)->getType() == IceType_v4f32);
   2988       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
   2989       Variable *T = makeReg(DestTy);
   2990       _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq);
   2991       _movp(Dest, T);
   2992     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
   2993       llvm::report_fatal_error("Helper call was expected");
   2994     } else {
   2995       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   2996       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
   2997       Variable *T_1 = nullptr;
   2998       if (Traits::Is64Bit && DestTy == IceType_i64) {
   2999         T_1 = makeReg(IceType_i64);
   3000       } else {
   3001         assert(DestTy != IceType_i64);
   3002         T_1 = makeReg(IceType_i32);
   3003       }
   3004       // cvt() requires its integer argument to be a GPR.
   3005       Variable *T_2 = makeReg(DestTy);
   3006       if (isByteSizedType(DestTy)) {
   3007         assert(T_1->getType() == IceType_i32);
   3008         T_1->setRegClass(RCX86_Is32To8);
   3009         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
   3010       }
   3011       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
   3012       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
   3013       if (DestTy == IceType_i1)
   3014         _and(T_2, Ctx->getConstantInt1(1));
   3015       _mov(Dest, T_2);
   3016     }
   3017     break;
   3018   case InstCast::Fptoui:
   3019     if (isVectorType(DestTy)) {
   3020       llvm::report_fatal_error("Helper call was expected");
   3021     } else if (DestTy == IceType_i64 ||
   3022                (!Traits::Is64Bit && DestTy == IceType_i32)) {
   3023       llvm::report_fatal_error("Helper call was expected");
   3024     } else {
   3025       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   3026       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
   3027       assert(DestTy != IceType_i64);
   3028       Variable *T_1 = nullptr;
   3029       if (Traits::Is64Bit && DestTy == IceType_i32) {
   3030         T_1 = makeReg(IceType_i64);
   3031       } else {
   3032         assert(DestTy != IceType_i32);
   3033         T_1 = makeReg(IceType_i32);
   3034       }
   3035       Variable *T_2 = makeReg(DestTy);
   3036       if (isByteSizedType(DestTy)) {
   3037         assert(T_1->getType() == IceType_i32);
   3038         T_1->setRegClass(RCX86_Is32To8);
   3039         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
   3040       }
   3041       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
   3042       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
   3043       if (DestTy == IceType_i1)
   3044         _and(T_2, Ctx->getConstantInt1(1));
   3045       _mov(Dest, T_2);
   3046     }
   3047     break;
   3048   case InstCast::Sitofp:
   3049     if (isVectorType(DestTy)) {
   3050       assert(DestTy == IceType_v4f32);
   3051       assert(Instr->getSrc(0)->getType() == IceType_v4i32);
   3052       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
   3053       Variable *T = makeReg(DestTy);
   3054       _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps);
   3055       _movp(Dest, T);
   3056     } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
   3057       llvm::report_fatal_error("Helper call was expected");
   3058     } else {
   3059       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
   3060       // Sign-extend the operand.
   3061       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
   3062       Variable *T_1 = nullptr;
   3063       if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
   3064         T_1 = makeReg(IceType_i64);
   3065       } else {
   3066         assert(Src0RM->getType() != IceType_i64);
   3067         T_1 = makeReg(IceType_i32);
   3068       }
   3069       Variable *T_2 = makeReg(DestTy);
   3070       if (Src0RM->getType() == T_1->getType())
   3071         _mov(T_1, Src0RM);
   3072       else
   3073         _movsx(T_1, Src0RM);
   3074       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
   3075       _mov(Dest, T_2);
   3076     }
   3077     break;
   3078   case InstCast::Uitofp: {
   3079     Operand *Src0 = Instr->getSrc(0);
   3080     if (isVectorType(Src0->getType())) {
   3081       llvm::report_fatal_error("Helper call was expected");
   3082     } else if (Src0->getType() == IceType_i64 ||
   3083                (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
   3084       llvm::report_fatal_error("Helper call was expected");
   3085     } else {
   3086       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   3087       // Zero-extend the operand.
   3088       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
   3089       Variable *T_1 = nullptr;
   3090       if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
   3091         T_1 = makeReg(IceType_i64);
   3092       } else {
   3093         assert(Src0RM->getType() != IceType_i64);
   3094         assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
   3095         T_1 = makeReg(IceType_i32);
   3096       }
   3097       Variable *T_2 = makeReg(DestTy);
   3098       if (Src0RM->getType() == T_1->getType())
   3099         _mov(T_1, Src0RM);
   3100       else
   3101         _movzx(T_1, Src0RM)->setMustKeep();
   3102       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
   3103       _mov(Dest, T_2);
   3104     }
   3105     break;
   3106   }
   3107   case InstCast::Bitcast: {
   3108     Operand *Src0 = Instr->getSrc(0);
   3109     if (DestTy == Src0->getType()) {
   3110       auto *Assign = InstAssign::create(Func, Dest, Src0);
   3111       lowerAssign(Assign);
   3112       return;
   3113     }
   3114     switch (DestTy) {
   3115     default:
   3116       llvm_unreachable("Unexpected Bitcast dest type");
   3117     case IceType_i8: {
   3118       llvm::report_fatal_error("Helper call was expected");
   3119     } break;
   3120     case IceType_i16: {
   3121       llvm::report_fatal_error("Helper call was expected");
   3122     } break;
   3123     case IceType_i32:
   3124     case IceType_f32: {
   3125       Variable *Src0R = legalizeToReg(Src0);
   3126       Variable *T = makeReg(DestTy);
   3127       _movd(T, Src0R);
   3128       _mov(Dest, T);
   3129     } break;
   3130     case IceType_i64: {
   3131       assert(Src0->getType() == IceType_f64);
   3132       if (Traits::Is64Bit) {
   3133         Variable *Src0R = legalizeToReg(Src0);
   3134         Variable *T = makeReg(IceType_i64);
   3135         _movd(T, Src0R);
   3136         _mov(Dest, T);
   3137       } else {
   3138         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   3139         // a.i64 = bitcast b.f64 ==>
   3140         //   s.f64 = spill b.f64
   3141         //   t_lo.i32 = lo(s.f64)
   3142         //   a_lo.i32 = t_lo.i32
   3143         //   t_hi.i32 = hi(s.f64)
   3144         //   a_hi.i32 = t_hi.i32
   3145         Operand *SpillLo, *SpillHi;
   3146         if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
   3147           Variable *Spill = Func->makeVariable(IceType_f64);
   3148           Spill->setLinkedTo(Src0Var);
   3149           Spill->setMustNotHaveReg();
   3150           _movq(Spill, Src0RM);
   3151           SpillLo = Traits::VariableSplit::create(Func, Spill,
   3152                                                   Traits::VariableSplit::Low);
   3153           SpillHi = Traits::VariableSplit::create(Func, Spill,
   3154                                                   Traits::VariableSplit::High);
   3155         } else {
   3156           SpillLo = loOperand(Src0RM);
   3157           SpillHi = hiOperand(Src0RM);
   3158         }
   3159 
   3160         auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   3161         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   3162         Variable *T_Lo = makeReg(IceType_i32);
   3163         Variable *T_Hi = makeReg(IceType_i32);
   3164 
   3165         _mov(T_Lo, SpillLo);
   3166         _mov(DestLo, T_Lo);
   3167         _mov(T_Hi, SpillHi);
   3168         _mov(DestHi, T_Hi);
   3169       }
   3170     } break;
   3171     case IceType_f64: {
   3172       assert(Src0->getType() == IceType_i64);
   3173       if (Traits::Is64Bit) {
   3174         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   3175         Variable *T = makeReg(IceType_f64);
   3176         _movd(T, Src0RM);
   3177         _mov(Dest, T);
   3178       } else {
   3179         Src0 = legalize(Src0);
   3180         if (llvm::isa<X86OperandMem>(Src0)) {
   3181           Variable *T = makeReg(DestTy);
   3182           _movq(T, Src0);
   3183           _movq(Dest, T);
   3184           break;
   3185         }
   3186         // a.f64 = bitcast b.i64 ==>
   3187         //   t_lo.i32 = b_lo.i32
   3188         //   FakeDef(s.f64)
   3189         //   lo(s.f64) = t_lo.i32
   3190         //   t_hi.i32 = b_hi.i32
   3191         //   hi(s.f64) = t_hi.i32
   3192         //   a.f64 = s.f64
   3193         Variable *Spill = Func->makeVariable(IceType_f64);
   3194         Spill->setLinkedTo(Dest);
   3195         Spill->setMustNotHaveReg();
   3196 
   3197         Variable *T_Lo = nullptr, *T_Hi = nullptr;
   3198         auto *SpillLo = Traits::VariableSplit::create(
   3199             Func, Spill, Traits::VariableSplit::Low);
   3200         auto *SpillHi = Traits::VariableSplit::create(
   3201             Func, Spill, Traits::VariableSplit::High);
   3202         _mov(T_Lo, loOperand(Src0));
   3203         // Technically, the Spill is defined after the _store happens, but
   3204         // SpillLo is considered a "use" of Spill, so define Spill before it
   3205         // is used.
   3206         Context.insert<InstFakeDef>(Spill);
   3207         _store(T_Lo, SpillLo);
   3208         _mov(T_Hi, hiOperand(Src0));
   3209         _store(T_Hi, SpillHi);
   3210         _movq(Dest, Spill);
   3211       }
   3212     } break;
   3213     case IceType_v8i1: {
   3214       llvm::report_fatal_error("Helper call was expected");
   3215     } break;
   3216     case IceType_v16i1: {
   3217       llvm::report_fatal_error("Helper call was expected");
   3218     } break;
   3219     case IceType_v8i16:
   3220     case IceType_v16i8:
   3221     case IceType_v4i32:
   3222     case IceType_v4f32: {
   3223       if (Src0->getType() == IceType_i32) {
   3224         // Bitcast requires equal type sizes, which isn't strictly the case
   3225         // between scalars and vectors, but to emulate v4i8 vectors one has to
   3226         // use v16i8 vectors.
   3227         assert(getFlags().getApplicationBinaryInterface() != ABI_PNaCl &&
   3228                "PNaCl only supports real 128-bit vectors");
   3229         _movd(Dest, legalize(Src0, Legal_Reg | Legal_Mem));
   3230       } else {
   3231         _movp(Dest, legalizeToReg(Src0));
   3232       }
   3233     } break;
   3234     }
   3235     break;
   3236   }
   3237   }
   3238 }
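
         // For instance, the scalar i1 -> i32 sext path above reduces to a sketch
         // like:
         //   movzx t, src        ; widen (movsx would work equally well here)
         //   shl   t, 31
         //   sar   t, 31         ; replicate the low bit across all 32 bits
         //   mov   dest, t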
   3239 
   3240 template <typename TraitsType>
   3241 void TargetX86Base<TraitsType>::lowerExtractElement(
   3242     const InstExtractElement *Instr) {
   3243   Operand *SourceVectNotLegalized = Instr->getSrc(0);
   3244   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
   3245   // Only constant indices are allowed in PNaCl IR.
   3246   assert(ElementIndex);
   3247 
   3248   unsigned Index = ElementIndex->getValue();
   3249   Type Ty = SourceVectNotLegalized->getType();
   3250   Type ElementTy = typeElementType(Ty);
   3251   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
   3252 
   3253   // TODO(wala): Determine the best lowering sequences for each type.
   3254   bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
   3255                      (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
   3256   Variable *ExtractedElementR =
   3257       makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
   3258   if (CanUsePextr) {
   3259     // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
   3260     // bits of the destination register, so we represent this by always
   3261     // extracting into an i32 register.  The _mov into Dest below will do
   3262     // truncation as necessary.
   3263     Constant *Mask = Ctx->getConstantInt32(Index);
   3264     Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
   3265     _pextr(ExtractedElementR, SourceVectR, Mask);
   3266   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
   3267     // Use pshufd and movd/movss.
   3268     Variable *T = nullptr;
   3269     if (Index) {
   3270       // The shuffle only needs to occur if the element to be extracted is not
   3271       // at the lowest index.
   3272       Constant *Mask = Ctx->getConstantInt32(Index);
   3273       T = makeReg(Ty);
   3274       _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
   3275     } else {
   3276       T = legalizeToReg(SourceVectNotLegalized);
   3277     }
   3278 
   3279     if (InVectorElementTy == IceType_i32) {
   3280       _movd(ExtractedElementR, T);
   3281     } else { // Ty == IceType_f32
   3282       // TODO(wala): _movss is only used here because _mov does not allow a
   3283       // vector source and a scalar destination.  _mov should be extended to
   3284       // support this case.
   3285       // _movss is a binary instruction, so the FakeDef is needed to keep the
   3286       // live range analysis consistent.
   3287       Context.insert<InstFakeDef>(ExtractedElementR);
   3288       _movss(ExtractedElementR, T);
   3289     }
   3290   } else {
   3291     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
   3292     // Spill the value to a stack slot and do the extraction in memory.
   3293     //
   3294     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
   3295     // for legalizing to mem is implemented.
   3296     Variable *Slot = Func->makeVariable(Ty);
   3297     Slot->setMustNotHaveReg();
   3298     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
   3299 
   3300     // Compute the location of the element in memory.
   3301     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
   3302     X86OperandMem *Loc =
   3303         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
   3304     _mov(ExtractedElementR, Loc);
   3305   }
   3306 
   3307   if (ElementTy == IceType_i1) {
   3308     // Truncate extracted integers to i1s if necessary.
   3309     Variable *T = makeReg(IceType_i1);
   3310     InstCast *Cast =
   3311         InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
   3312     lowerCast(Cast);
   3313     ExtractedElementR = T;
   3314   }
   3315 
   3316   // Copy the element to the destination.
   3317   Variable *Dest = Instr->getDest();
   3318   _mov(Dest, ExtractedElementR);
   3319 }
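
         // E.g., extracting element 2 of a v8i16 always takes the pextr path above
         // (a sketch; the i32 temporary receives the zero-extended lane):
         //   pextrw t.i32, srcvec, 2
         //   mov    dest, t.i32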
   3320 
   3321 template <typename TraitsType>
   3322 void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
   3323   Variable *Dest = Fcmp->getDest();
   3324 
   3325   if (isVectorType(Dest->getType())) {
   3326     lowerFcmpVector(Fcmp);
   3327   } else {
   3328     constexpr Inst *Consumer = nullptr;
   3329     lowerFcmpAndConsumer(Fcmp, Consumer);
   3330   }
   3331 }
   3332 
   3333 template <typename TraitsType>
   3334 void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
   3335                                                      const Inst *Consumer) {
   3336   Operand *Src0 = Fcmp->getSrc(0);
   3337   Operand *Src1 = Fcmp->getSrc(1);
   3338   Variable *Dest = Fcmp->getDest();
   3339 
   3340   if (Consumer != nullptr) {
   3341     if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
   3342       if (lowerOptimizeFcmpSelect(Fcmp, Select))
   3343         return;
   3344     }
   3345   }
   3346 
   3347   if (isVectorType(Dest->getType())) {
   3348     lowerFcmp(Fcmp);
   3349     if (Consumer != nullptr)
   3350       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
   3351     return;
   3352   }
   3353 
   3354   // Lowering a = fcmp cond, b, c
   3355   //   ucomiss b, c       /* only if C1 != Br_None */
   3356   //                      /* but swap b,c order if SwapOperands==true */
   3357   //   mov a, <default>
   3358   //   j<C1> label        /* only if C1 != Br_None */
   3359   //   j<C2> label        /* only if C2 != Br_None */
   3360   //   FakeUse(a)         /* only if C1 != Br_None */
   3361   //   mov a, !<default>  /* only if C1 != Br_None */
   3362   //   label:             /* only if C1 != Br_None */
   3363   //
   3364   // setcc lowering when C1 != Br_None && C2 == Br_None:
   3365   //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
   3366   //   setcc a, C1
   3367   InstFcmp::FCond Condition = Fcmp->getCondition();
   3368   assert(Condition < Traits::TableFcmpSize);
   3369   if (Traits::TableFcmp[Condition].SwapScalarOperands)
   3370     std::swap(Src0, Src1);
   3371   const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None);
   3372   const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None);
   3373   if (HasC1) {
   3374     Src0 = legalize(Src0);
   3375     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   3376     Variable *T = nullptr;
   3377     _mov(T, Src0);
   3378     _ucomiss(T, Src1RM);
   3379     if (!HasC2) {
   3380       assert(Traits::TableFcmp[Condition].Default);
   3381       setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
   3382       return;
   3383     }
   3384   }
   3385   int32_t IntDefault = Traits::TableFcmp[Condition].Default;
   3386   if (Consumer == nullptr) {
   3387     Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
   3388     _mov(Dest, Default);
   3389     if (HasC1) {
   3390       InstX86Label *Label = InstX86Label::create(Func, this);
   3391       _br(Traits::TableFcmp[Condition].C1, Label);
   3392       if (HasC2) {
   3393         _br(Traits::TableFcmp[Condition].C2, Label);
   3394       }
   3395       Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
   3396       _redefined(_mov(Dest, NonDefault));
   3397       Context.insert(Label);
   3398     }
   3399     return;
   3400   }
   3401   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
   3402     CfgNode *TrueSucc = Br->getTargetTrue();
   3403     CfgNode *FalseSucc = Br->getTargetFalse();
   3404     if (IntDefault != 0)
   3405       std::swap(TrueSucc, FalseSucc);
   3406     if (HasC1) {
   3407       _br(Traits::TableFcmp[Condition].C1, FalseSucc);
   3408       if (HasC2) {
   3409         _br(Traits::TableFcmp[Condition].C2, FalseSucc);
   3410       }
   3411       _br(TrueSucc);
   3412       return;
   3413     }
   3414     _br(FalseSucc);
   3415     return;
   3416   }
   3417   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
   3418     Operand *SrcT = Select->getTrueOperand();
   3419     Operand *SrcF = Select->getFalseOperand();
   3420     Variable *SelectDest = Select->getDest();
   3421     if (IntDefault != 0)
   3422       std::swap(SrcT, SrcF);
   3423     lowerMove(SelectDest, SrcF, false);
   3424     if (HasC1) {
   3425       InstX86Label *Label = InstX86Label::create(Func, this);
   3426       _br(Traits::TableFcmp[Condition].C1, Label);
   3427       if (HasC2) {
   3428         _br(Traits::TableFcmp[Condition].C2, Label);
   3429       }
   3430       static constexpr bool IsRedefinition = true;
   3431       lowerMove(SelectDest, SrcT, IsRedefinition);
   3432       Context.insert(Label);
   3433     }
   3434     return;
   3435   }
   3436   llvm::report_fatal_error("Unexpected consumer type");
   3437 }
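
         // When the consumer is a branch, the scalar compare feeds the branch
         // directly; after the Default-driven successor swap, the shape is roughly
         // (a sketch; C1/C2 come from TableFcmp):
         //   ucomiss b, c
         //   j<C1>   false_succ
         //   j<C2>   false_succ  ; only for predicates that need two conditions
         //   jmp     true_succ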
   3438 
   3439 template <typename TraitsType>
   3440 void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
   3441   Operand *Src0 = Fcmp->getSrc(0);
   3442   Operand *Src1 = Fcmp->getSrc(1);
   3443   Variable *Dest = Fcmp->getDest();
   3444 
   3445   if (!isVectorType(Dest->getType()))
   3446     llvm::report_fatal_error("Expected vector compare");
   3447 
   3448   InstFcmp::FCond Condition = Fcmp->getCondition();
   3449   assert(Condition < Traits::TableFcmpSize);
   3450 
   3451   if (Traits::TableFcmp[Condition].SwapVectorOperands)
   3452     std::swap(Src0, Src1);
   3453 
   3454   Variable *T = nullptr;
   3455 
   3456   if (Condition == InstFcmp::True) {
   3457     // makeVectorOfMinusOnes() requires an integer vector type.
   3458     T = makeVectorOfMinusOnes(IceType_v4i32);
   3459   } else if (Condition == InstFcmp::False) {
   3460     T = makeVectorOfZeros(Dest->getType());
   3461   } else {
   3462     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   3463     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   3464     if (llvm::isa<X86OperandMem>(Src1RM))
   3465       Src1RM = legalizeToReg(Src1RM);
   3466 
   3467     switch (Condition) {
   3468     default: {
   3469       const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
   3470       assert(Predicate != Traits::Cond::Cmpps_Invalid);
   3471       T = makeReg(Src0RM->getType());
   3472       _movp(T, Src0RM);
   3473       _cmpps(T, Src1RM, Predicate);
   3474     } break;
   3475     case InstFcmp::One: {
   3476       // Check both unequal and ordered.
   3477       T = makeReg(Src0RM->getType());
   3478       Variable *T2 = makeReg(Src0RM->getType());
   3479       _movp(T, Src0RM);
   3480       _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq);
   3481       _movp(T2, Src0RM);
   3482       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord);
   3483       _pand(T, T2);
   3484     } break;
   3485     case InstFcmp::Ueq: {
   3486       // Check both equal or unordered.
   3487       T = makeReg(Src0RM->getType());
   3488       Variable *T2 = makeReg(Src0RM->getType());
   3489       _movp(T, Src0RM);
   3490       _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq);
   3491       _movp(T2, Src0RM);
   3492       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord);
   3493       _por(T, T2);
   3494     } break;
   3495     }
   3496   }
   3497 
   3498   assert(T != nullptr);
   3499   _movp(Dest, T);
   3500   eliminateNextVectorSextInstruction(Dest);
   3501 }
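
         // E.g., the InstFcmp::One ("ordered and unequal") case above expands to a
         // sketch like:
         //   movp  t,  src0 ; cmpps t,  src1, neq
         //   movp  t2, src0 ; cmpps t2, src1, ord
         //   pand  t,  t2   ; lane-wise AND of the two predicates
         //   movp  dest, t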
   3502 
   3503 inline bool isZero(const Operand *Opnd) {
   3504   if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
   3505     return C64->getValue() == 0;
   3506   if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
   3507     return C32->getValue() == 0;
   3508   return false;
   3509 }
   3510 
   3511 template <typename TraitsType>
   3512 void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
   3513                                                      const Inst *Consumer) {
   3514   Operand *Src0 = legalize(Icmp->getSrc(0));
   3515   Operand *Src1 = legalize(Icmp->getSrc(1));
   3516   Variable *Dest = Icmp->getDest();
   3517 
   3518   if (isVectorType(Dest->getType())) {
   3519     lowerIcmp(Icmp);
   3520     if (Consumer != nullptr)
   3521       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
   3522     return;
   3523   }
   3524 
   3525   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
   3526     lowerIcmp64(Icmp, Consumer);
   3527     return;
   3528   }
   3529 
   3530   // cmp b, c
   3531   if (isZero(Src1)) {
   3532     switch (Icmp->getCondition()) {
   3533     default:
   3534       break;
   3535     case InstIcmp::Uge:
   3536       movOrConsumer(true, Dest, Consumer);
   3537       return;
   3538     case InstIcmp::Ult:
   3539       movOrConsumer(false, Dest, Consumer);
   3540       return;
   3541     }
   3542   }
   3543   Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
   3544   _cmp(Src0RM, Src1);
   3545   setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
   3546                   Consumer);
   3547 }
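
         // In the simple scalar case with no consumer, the lowering above reduces
         // to a sketch like:
         //   cmp   b, c
         //   setcc a, <cond>     ; <cond> from getIcmp32Mapping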
   3548 
   3549 template <typename TraitsType>
   3550 void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
   3551   Operand *Src0 = legalize(Icmp->getSrc(0));
   3552   Operand *Src1 = legalize(Icmp->getSrc(1));
   3553   Variable *Dest = Icmp->getDest();
   3554 
   3555   if (!isVectorType(Dest->getType()))
   3556     llvm::report_fatal_error("Expected a vector compare");
   3557 
   3558   Type Ty = Src0->getType();
   3559   // Promote i1 vectors to 128-bit integer vector types.
   3560   if (typeElementType(Ty) == IceType_i1) {
   3561     Type NewTy = IceType_NUM;
   3562     switch (Ty) {
   3563     default:
   3564       llvm::report_fatal_error("unexpected type");
   3565       break;
   3566     case IceType_v4i1:
   3567       NewTy = IceType_v4i32;
   3568       break;
   3569     case IceType_v8i1:
   3570       NewTy = IceType_v8i16;
   3571       break;
   3572     case IceType_v16i1:
   3573       NewTy = IceType_v16i8;
   3574       break;
   3575     }
   3576     Variable *NewSrc0 = Func->makeVariable(NewTy);
   3577     Variable *NewSrc1 = Func->makeVariable(NewTy);
   3578     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
   3579     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
   3580     Src0 = NewSrc0;
   3581     Src1 = NewSrc1;
   3582     Ty = NewTy;
   3583   }
   3584 
   3585   InstIcmp::ICond Condition = Icmp->getCondition();
   3586 
   3587   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   3588   Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   3589 
   3590   // SSE2 only has signed comparison operations. To compare unsigned operands,
   3591   // flip the sign (high-order) bit of each input; this maps an unsigned
   3592   // comparison onto the corresponding signed comparison.
   3593   if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
   3594       Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
   3595     Variable *T0 = makeReg(Ty);
   3596     Variable *T1 = makeReg(Ty);
   3597     Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
   3598     _movp(T0, Src0RM);
   3599     _pxor(T0, HighOrderBits);
   3600     _movp(T1, Src1RM);
   3601     _pxor(T1, HighOrderBits);
   3602     Src0RM = T0;
   3603     Src1RM = T1;
   3604   }
   3605 
   3606   Variable *T = makeReg(Ty);
   3607   switch (Condition) {
   3608   default:
   3609     llvm_unreachable("unexpected condition");
   3610     break;
   3611   case InstIcmp::Eq: {
   3612     if (llvm::isa<X86OperandMem>(Src1RM))
   3613       Src1RM = legalizeToReg(Src1RM);
   3614     _movp(T, Src0RM);
   3615     _pcmpeq(T, Src1RM);
   3616   } break;
   3617   case InstIcmp::Ne: {
   3618     if (llvm::isa<X86OperandMem>(Src1RM))
   3619       Src1RM = legalizeToReg(Src1RM);
   3620     _movp(T, Src0RM);
   3621     _pcmpeq(T, Src1RM);
   3622     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
   3623     _pxor(T, MinusOne);
   3624   } break;
   3625   case InstIcmp::Ugt:
   3626   case InstIcmp::Sgt: {
   3627     if (llvm::isa<X86OperandMem>(Src1RM))
   3628       Src1RM = legalizeToReg(Src1RM);
   3629     _movp(T, Src0RM);
   3630     _pcmpgt(T, Src1RM);
   3631   } break;
   3632   case InstIcmp::Uge:
   3633   case InstIcmp::Sge: {
   3634     // !(Src1RM > Src0RM)
   3635     if (llvm::isa<X86OperandMem>(Src0RM))
   3636       Src0RM = legalizeToReg(Src0RM);
   3637     _movp(T, Src1RM);
   3638     _pcmpgt(T, Src0RM);
   3639     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
   3640     _pxor(T, MinusOne);
   3641   } break;
   3642   case InstIcmp::Ult:
   3643   case InstIcmp::Slt: {
   3644     if (llvm::isa<X86OperandMem>(Src0RM))
   3645       Src0RM = legalizeToReg(Src0RM);
   3646     _movp(T, Src1RM);
   3647     _pcmpgt(T, Src0RM);
   3648   } break;
   3649   case InstIcmp::Ule:
   3650   case InstIcmp::Sle: {
   3651     // !(Src0RM > Src1RM)
   3652     if (llvm::isa<X86OperandMem>(Src1RM))
   3653       Src1RM = legalizeToReg(Src1RM);
   3654     _movp(T, Src0RM);
   3655     _pcmpgt(T, Src1RM);
   3656     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
   3657     _pxor(T, MinusOne);
   3658   } break;
   3659   }
   3660 
   3661   _movp(Dest, T);
   3662   eliminateNextVectorSextInstruction(Dest);
   3663 }
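
         // E.g., icmp ugt on v4i32 first flips the sign bits of both operands so
         // that pcmpgt's signed compare yields the unsigned answer (a sketch):
         //   movp t0, src0 ; pxor t0, <0x80000000 x4>
         //   movp t1, src1 ; pxor t1, <0x80000000 x4>
         //   movp t,  t0   ; pcmpgt t, t1 ; movp dest, t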
   3664 
   3665 template <typename TraitsType>
   3666 template <typename T>
   3667 typename std::enable_if<!T::Is64Bit, void>::type
   3668 TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
   3669                                        const Inst *Consumer) {
   3670   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
   3671   Operand *Src0 = legalize(Icmp->getSrc(0));
   3672   Operand *Src1 = legalize(Icmp->getSrc(1));
   3673   Variable *Dest = Icmp->getDest();
   3674   InstIcmp::ICond Condition = Icmp->getCondition();
   3675   assert(Condition < Traits::TableIcmp64Size);
   3676   Operand *Src0LoRM = nullptr;
   3677   Operand *Src0HiRM = nullptr;
   3678   // Legalize the portions of Src0 that are going to be needed.
   3679   if (isZero(Src1)) {
   3680     switch (Condition) {
   3681     default:
   3682       llvm_unreachable("unexpected condition");
   3683       break;
   3684     // These two are not optimized, so we fall through to the general case,
   3685     // which needs the upper and lower halves legalized.
   3686     case InstIcmp::Sgt:
   3687     case InstIcmp::Sle:
   3688     // These four compare after performing an "or" of the high and low half, so
   3689     // they need the upper and lower halves legalized.
   3690     case InstIcmp::Eq:
   3691     case InstIcmp::Ule:
   3692     case InstIcmp::Ne:
   3693     case InstIcmp::Ugt:
   3694       Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
              // Fallthrough intended.
   3695     // These two test only the high half's sign bit, so they need only
   3696     // the upper half legalized.
   3697     case InstIcmp::Sge:
   3698     case InstIcmp::Slt:
   3699       Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
   3700       break;
   3701 
   3702     // These two move constants and hence need no legalization.
   3703     case InstIcmp::Uge:
   3704     case InstIcmp::Ult:
   3705       break;
   3706     }
   3707   } else {
   3708     Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
   3709     Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
   3710   }
   3711   // Optimize comparisons with zero.
   3712   if (isZero(Src1)) {
   3713     Constant *SignMask = Ctx->getConstantInt32(0x80000000);
   3714     Variable *Temp = nullptr;
   3715     switch (Condition) {
   3716     default:
   3717       llvm_unreachable("unexpected condition");
   3718       break;
   3719     case InstIcmp::Eq:
   3720     case InstIcmp::Ule:
   3721       // Mov Src0HiRM first, because it was legalized most recently, and will
   3722       // sometimes avoid a move before the OR.
   3723       _mov(Temp, Src0HiRM);
   3724       _or(Temp, Src0LoRM);
   3725       Context.insert<InstFakeUse>(Temp);
   3726       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
   3727       return;
   3728     case InstIcmp::Ne:
   3729     case InstIcmp::Ugt:
   3730       // Mov Src0HiRM first, because it was legalized most recently, and will
   3731       // sometimes avoid a move before the OR.
   3732       _mov(Temp, Src0HiRM);
   3733       _or(Temp, Src0LoRM);
   3734       Context.insert<InstFakeUse>(Temp);
   3735       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
   3736       return;
   3737     case InstIcmp::Uge:
   3738       movOrConsumer(true, Dest, Consumer);
   3739       return;
   3740     case InstIcmp::Ult:
   3741       movOrConsumer(false, Dest, Consumer);
   3742       return;
   3743     case InstIcmp::Sgt:
   3744       break;
   3745     case InstIcmp::Sge:
   3746       _test(Src0HiRM, SignMask);
   3747       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
   3748       return;
   3749     case InstIcmp::Slt:
   3750       _test(Src0HiRM, SignMask);
   3751       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
   3752       return;
   3753     case InstIcmp::Sle:
   3754       break;
   3755     }
   3756   }
   3757   // Handle general compares.
   3758   Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
   3759   Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
   3760   if (Consumer == nullptr) {
   3761     Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
   3762     Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
   3763     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
   3764     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
   3765     _mov(Dest, One);
   3766     _cmp(Src0HiRM, Src1HiRI);
   3767     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
   3768       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
   3769     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
   3770       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
   3771     _cmp(Src0LoRM, Src1LoRI);
   3772     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
   3773     Context.insert(LabelFalse);
   3774     _redefined(_mov(Dest, Zero));
   3775     Context.insert(LabelTrue);
   3776     return;
   3777   }
   3778   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
   3779     _cmp(Src0HiRM, Src1HiRI);
   3780     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
   3781       _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
   3782     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
   3783       _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
   3784     _cmp(Src0LoRM, Src1LoRI);
   3785     _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
   3786         Br->getTargetFalse());
   3787     return;
   3788   }
   3789   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
   3790     Operand *SrcT = Select->getTrueOperand();
   3791     Operand *SrcF = Select->getFalseOperand();
   3792     Variable *SelectDest = Select->getDest();
   3793     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
   3794     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
   3795     lowerMove(SelectDest, SrcT, false);
   3796     _cmp(Src0HiRM, Src1HiRI);
   3797     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
   3798       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
   3799     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
   3800       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
   3801     _cmp(Src0LoRM, Src1LoRI);
   3802     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
   3803     Context.insert(LabelFalse);
   3804     static constexpr bool IsRedefinition = true;
   3805     lowerMove(SelectDest, SrcF, IsRedefinition);
   3806     Context.insert(LabelTrue);
   3807     return;
   3808   }
   3809   llvm::report_fatal_error("Unexpected consumer type");
   3810 }
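
         // A sketch of the general 64-bit compare with no consumer (C1/C2/C3 come
         // from TableIcmp64):
         //   mov   dest, 1
         //   cmp   src0.hi, src1.hi
         //   j<C1> label_true
         //   j<C2> label_false
         //   cmp   src0.lo, src1.lo
         //   j<C3> label_true
         // label_false:
         //   mov   dest, 0
         // label_true: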
   3811 
   3812 template <typename TraitsType>
   3813 void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
   3814                                                 Variable *Dest,
   3815                                                 const Inst *Consumer) {
   3816   if (Consumer == nullptr) {
   3817     _setcc(Dest, Condition);
   3818     return;
   3819   }
   3820   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
   3821     _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
   3822     return;
   3823   }
   3824   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
   3825     Operand *SrcT = Select->getTrueOperand();
   3826     Operand *SrcF = Select->getFalseOperand();
   3827     Variable *SelectDest = Select->getDest();
   3828     lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
   3829     return;
   3830   }
   3831   llvm::report_fatal_error("Unexpected consumer type");
   3832 }
   3833 
   3834 template <typename TraitsType>
   3835 void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
   3836                                               const Inst *Consumer) {
   3837   if (Consumer == nullptr) {
   3838     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
   3839     return;
   3840   }
   3841   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
   3842     // TODO(sehr,stichnot): This could be done with a single unconditional
   3843     // branch instruction, but Subzero doesn't yet know how to handle the
   3844     // resulting control-flow-graph changes.  Teach it to do so, to eliminate
            // the mov and cmp.
   3845     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
   3846     _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
   3847     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
   3848     return;
   3849   }
   3850   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
   3851     Operand *Src = nullptr;
   3852     if (IcmpResult) {
   3853       Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
   3854     } else {
   3855       Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
   3856     }
   3857     Variable *SelectDest = Select->getDest();
   3858     lowerMove(SelectDest, Src, false);
   3859     return;
   3860   }
   3861   llvm::report_fatal_error("Unexpected consumer type");
   3862 }
   3863 
   3864 template <typename TraitsType>
   3865 void TargetX86Base<TraitsType>::lowerArithAndConsumer(
   3866     const InstArithmetic *Arith, const Inst *Consumer) {
   3867   Variable *T = nullptr;
   3868   Operand *Src0 = legalize(Arith->getSrc(0));
   3869   Operand *Src1 = legalize(Arith->getSrc(1));
   3870   Variable *Dest = Arith->getDest();
   3871   switch (Arith->getOp()) {
   3872   default:
   3873     llvm_unreachable("arithmetic operator not AND or OR");
   3874     break;
   3875   case InstArithmetic::And:
   3876     _mov(T, Src0);
   3877     // Test cannot have an address in the second position.  Since T is
   3878     // guaranteed to be a register and Src1 could be a memory load, ensure
   3879     // that the second argument is a register.
   3880     if (llvm::isa<Constant>(Src1))
   3881       _test(T, Src1);
   3882     else
   3883       _test(Src1, T);
   3884     break;
   3885   case InstArithmetic::Or:
   3886     _mov(T, Src0);
   3887     _or(T, Src1);
   3888     break;
   3889   }
   3890 
   3891   if (Consumer == nullptr) {
   3892     llvm::report_fatal_error("Expected a consumer instruction");
   3893   }
   3894   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
   3895     Context.insert<InstFakeUse>(T);
   3896     Context.insert<InstFakeDef>(Dest);
   3897     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
   3898     return;
   3899   }
   3900   llvm::report_fatal_error("Unexpected consumer type");
   3901 }
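
         // E.g., a branch consuming `(a & b) != 0` folds the AND into a
         // flags-setting test (a sketch):
         //   mov  t, a
         //   test b, t           ; or `test t, imm` when b is a constant
         //   jne  target_true
         //   jmp  target_false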
   3902 
   3903 template <typename TraitsType>
   3904 void TargetX86Base<TraitsType>::lowerInsertElement(
   3905     const InstInsertElement *Instr) {
   3906   Operand *SourceVectNotLegalized = Instr->getSrc(0);
   3907   Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
   3908   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
   3909   // Only constant indices are allowed in PNaCl IR.
   3910   assert(ElementIndex);
   3911   unsigned Index = ElementIndex->getValue();
   3912   assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
   3913 
   3914   Type Ty = SourceVectNotLegalized->getType();
   3915   Type ElementTy = typeElementType(Ty);
   3916   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
   3917 
   3918   if (ElementTy == IceType_i1) {
   3919     // Expand the element to the appropriate size for it to be inserted in the
   3920     // vector.
   3921     Variable *Expanded = Func->makeVariable(InVectorElementTy);
   3922     auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
   3923                                   ElementToInsertNotLegalized);
   3924     lowerCast(Cast);
   3925     ElementToInsertNotLegalized = Expanded;
   3926   }
   3927 
   3928   if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
   3929       InstructionSet >= Traits::SSE4_1) {
   3930     // Use insertps, pinsrb, pinsrw, or pinsrd.
   3931     Operand *ElementRM =
   3932         legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
   3933     Operand *SourceVectRM =
   3934         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
   3935     Variable *T = makeReg(Ty);
   3936     _movp(T, SourceVectRM);
   3937     if (Ty == IceType_v4f32) {
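              // insertps's immediate places the destination lane in bits [5:4]
              // (bits [7:6] pick the source lane and bits [3:0] are a zero mask,
              // all left clear here), hence Index << 4.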
   3938       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
   3939     } else {
   3940       // For the pinsrb and pinsrw instructions, when the source operand is a
   3941       // register, it must be a full r32 register like eax, and not ax/al/ah.
   3942       // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for
   3943       // the use of r16 and r8 by converting them through getBaseReg(), while
   3944       // emitIAS() validates that the original and base register encodings are
   3945       // the same.
   3946       if (ElementRM->getType() == IceType_i8 &&
   3947           llvm::isa<Variable>(ElementRM)) {
   3948         // Don't use ah/bh/ch/dh for pinsrb.
   3949         ElementRM = copyToReg8(ElementRM);
   3950       }
   3951       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
   3952     }
   3953     _movp(Instr->getDest(), T);
   3954   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
   3955     // Use shufps or movss.
   3956     Variable *ElementR = nullptr;
   3957     Operand *SourceVectRM =
   3958         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
   3959 
   3960     if (InVectorElementTy == IceType_f32) {
   3961       // ElementR will be in an XMM register since it is floating point.
   3962       ElementR = legalizeToReg(ElementToInsertNotLegalized);
   3963     } else {
   3964       // Copy an integer to an XMM register.
   3965       Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
   3966       ElementR = makeReg(Ty);
   3967       _movd(ElementR, T);
   3968     }
   3969 
   3970     if (Index == 0) {
   3971       Variable *T = makeReg(Ty);
   3972       _movp(T, SourceVectRM);
   3973       _movss(T, ElementR);
   3974       _movp(Instr->getDest(), T);
   3975       return;
   3976     }
   3977 
   3978     // shufps treats the source and destination operands as vectors of four
   3979     // doublewords. The destination's two high doublewords are selected from
   3980     // the source operand and the two low doublewords are selected from the
   3981     // (original value of) the destination operand. An insertelement operation
   3982     // can be effected with a sequence of two shufps operations with
   3983     // appropriate masks. In all cases below, Element[0] is being inserted into
   3984     // SourceVectOperand. Indices are ordered from left to right.
   3985     //
   3986     // insertelement into index 1 (result is stored in ElementR):
   3987     //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
   3988     //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
   3989     //
   3990     // insertelement into index 2 (result is stored in T):
   3991     //   T := SourceVectRM
   3992     //   ElementR := ElementR[0, 0] T[0, 3]
   3993     //   T := T[0, 1] ElementR[0, 3]
   3994     //
   3995     // insertelement into index 3 (result is stored in T):
   3996     //   T := SourceVectRM
   3997     //   ElementR := ElementR[0, 0] T[0, 2]
   3998     //   T := T[0, 1] ElementR[3, 0]
   3999     const unsigned char Mask1[3] = {0, 192, 128};
   4000     const unsigned char Mask2[3] = {227, 196, 52};
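            // For illustration, decoding the Index == 1 pair (lane 0 listed
            // first): Mask1[0] == 0 (0b00000000) makes the first shufps produce
            // {ElementR[0], ElementR[0], SourceVectRM[0], SourceVectRM[0]}; then
            // Mask2[0] == 227 (0b11100011) picks lanes {3, 0} of that temporary
            // and lanes {2, 3} of SourceVectRM, yielding
            // {SourceVectRM[0], ElementR[0], SourceVectRM[2], SourceVectRM[3]}.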
   4001 
   4002     Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
   4003     Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
   4004 
   4005     if (Index == 1) {
   4006       _shufps(ElementR, SourceVectRM, Mask1Constant);
   4007       _shufps(ElementR, SourceVectRM, Mask2Constant);
   4008       _movp(Instr->getDest(), ElementR);
   4009     } else {
   4010       Variable *T = makeReg(Ty);
   4011       _movp(T, SourceVectRM);
   4012       _shufps(ElementR, T, Mask1Constant);
   4013       _shufps(T, ElementR, Mask2Constant);
   4014       _movp(Instr->getDest(), T);
   4015     }
   4016   } else {
   4017     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
   4018     // Spill the value to a stack slot and perform the insertion in memory.
   4019     //
   4020     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
   4021     // for legalizing to mem is implemented.
   4022     Variable *Slot = Func->makeVariable(Ty);
   4023     Slot->setMustNotHaveReg();
   4024     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
   4025 
   4026     // Compute the location of the position to insert in memory.
   4027     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
   4028     X86OperandMem *Loc =
   4029         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
   4030     _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
   4031 
   4032     Variable *T = makeReg(Ty);
   4033     _movp(T, Slot);
   4034     _movp(Instr->getDest(), T);
   4035   }
   4036 }
   4037 
   4038 template <typename TraitsType>
   4039 void TargetX86Base<TraitsType>::lowerIntrinsicCall(
   4040     const InstIntrinsicCall *Instr) {
   4041   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
   4042   case Intrinsics::AtomicCmpxchg: {
   4043     if (!Intrinsics::isMemoryOrderValid(
   4044             ID, getConstantMemoryOrder(Instr->getArg(3)),
   4045             getConstantMemoryOrder(Instr->getArg(4)))) {
   4046       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
   4047       return;
   4048     }
   4049     Variable *DestPrev = Instr->getDest();
   4050     Operand *PtrToMem = legalize(Instr->getArg(0));
   4051     Operand *Expected = legalize(Instr->getArg(1));
   4052     Operand *Desired = legalize(Instr->getArg(2));
   4053     if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
   4054       return;
   4055     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
   4056     return;
   4057   }
   4058   case Intrinsics::AtomicFence:
   4059     if (!Intrinsics::isMemoryOrderValid(
   4060             ID, getConstantMemoryOrder(Instr->getArg(0)))) {
   4061       Func->setError("Unexpected memory ordering for AtomicFence");
   4062       return;
   4063     }
   4064     _mfence();
   4065     return;
   4066   case Intrinsics::AtomicFenceAll:
   4067     // NOTE: FenceAll should prevent any load/store from being moved across the
   4068     // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
   4069     // currently marked coarsely as "HasSideEffects".
   4070     _mfence();
   4071     return;
   4072   case Intrinsics::AtomicIsLockFree: {
   4073     // X86 is always lock free for 8/16/32/64 bit accesses.
   4074     // TODO(jvoung): Since the result is constant when given a constant byte
   4075     // size, this opens up DCE opportunities.
   4076     Operand *ByteSize = Instr->getArg(0);
   4077     Variable *Dest = Instr->getDest();
   4078     if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
   4079       Constant *Result;
   4080       switch (CI->getValue()) {
   4081       default:
   4082         // Some x86-64 processors support the cmpxchg16b instruction, which can
   4083         // make 16-byte operations lock free (when used with the LOCK prefix).
   4084         // However, that's not supported in 32-bit mode, so just return 0 even
   4085         // for large sizes.
   4086         Result = Ctx->getConstantZero(IceType_i32);
   4087         break;
   4088       case 1:
   4089       case 2:
   4090       case 4:
   4091       case 8:
   4092         Result = Ctx->getConstantInt32(1);
   4093         break;
   4094       }
   4095       _mov(Dest, Result);
   4096       return;
   4097     }
   4098     // The PNaCl ABI requires the byte size to be a compile-time constant.
   4099     Func->setError("AtomicIsLockFree byte size should be compile-time const");
   4100     return;
   4101   }
   4102   case Intrinsics::AtomicLoad: {
   4103     // We require the memory address to be naturally aligned. Given that,
   4104     // normal loads are atomic.
   4105     if (!Intrinsics::isMemoryOrderValid(
   4106             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
   4107       Func->setError("Unexpected memory ordering for AtomicLoad");
   4108       return;
   4109     }
   4110     Variable *Dest = Instr->getDest();
   4111     if (!Traits::Is64Bit) {
   4112       if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
   4113         // Follow what GCC does and use a movq instead of what lowerLoad()
   4114         // normally does (split the load into two). Thus, this skips
   4115         // load/arithmetic op folding. Load/arithmetic folding can't happen
   4116         // anyway, since this is x86-32 and integer arithmetic only happens on
   4117         // 32-bit quantities.
   4118         Variable *T = makeReg(IceType_f64);
   4119         X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
   4120         _movq(T, Addr);
   4121         // Then cast the bits back out of the XMM register to the i64 Dest.
   4122         auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
   4123         lowerCast(Cast);
   4124         // Make sure that the atomic load isn't elided when unused.
   4125         Context.insert<InstFakeUse>(Dest64On32->getLo());
   4126         Context.insert<InstFakeUse>(Dest64On32->getHi());
   4127         return;
   4128       }
   4129     }
   4130     auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
   4131     lowerLoad(Load);
   4132     // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
   4133     // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
   4134     // the FakeUse on the last-inserted instruction's dest.
   4135     Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
   4136     return;
   4137   }
   4138   case Intrinsics::AtomicRMW:
   4139     if (!Intrinsics::isMemoryOrderValid(
   4140             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
   4141       Func->setError("Unexpected memory ordering for AtomicRMW");
   4142       return;
   4143     }
   4144     lowerAtomicRMW(
   4145         Instr->getDest(),
   4146         static_cast<uint32_t>(
   4147             llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
   4148         Instr->getArg(1), Instr->getArg(2));
   4149     return;
   4150   case Intrinsics::AtomicStore: {
   4151     if (!Intrinsics::isMemoryOrderValid(
   4152             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
   4153       Func->setError("Unexpected memory ordering for AtomicStore");
   4154       return;
   4155     }
   4156     // We require the memory address to be naturally aligned. Given that,
   4157     // normal stores are atomic. Add a fence after the store to make it
   4158     // visible.
   4159     Operand *Value = Instr->getArg(0);
   4160     Operand *Ptr = Instr->getArg(1);
   4161     if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
   4162       // Use a movq instead of what lowerStore() normally does (split the store
   4163       // into two), following what GCC does. First bitcast the i64 value into
   4164       // an xmm register.
   4165       Variable *T = makeReg(IceType_f64);
   4166       auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
   4167       lowerCast(Cast);
   4168       // Then store XMM w/ a movq.
   4169       X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
   4170       _storeq(T, Addr);
   4171       _mfence();
   4172       return;
   4173     }
   4174     auto *Store = InstStore::create(Func, Value, Ptr);
   4175     lowerStore(Store);
   4176     _mfence();
   4177     return;
   4178   }
   4179   case Intrinsics::Bswap: {
   4180     Variable *Dest = Instr->getDest();
   4181     Operand *Val = Instr->getArg(0);
   4182     // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
   4183     // must be a register. Use rotate left for 16-bit bswap.
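            // E.g., rol 8 turns 0xAABB into 0xBBAA, swapping the two bytes.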
   4184     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
   4185       Val = legalizeUndef(Val);
   4186       Variable *T_Lo = legalizeToReg(loOperand(Val));
   4187       Variable *T_Hi = legalizeToReg(hiOperand(Val));
   4188       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   4189       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   4190       _bswap(T_Lo);
   4191       _bswap(T_Hi);
   4192       _mov(DestLo, T_Hi);
   4193       _mov(DestHi, T_Lo);
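              // Illustrative trace: for 0x1122334455667788, Lo = 0x55667788 and
              // Hi = 0x11223344; after the two bswaps, DestLo = bswap(Hi) =
              // 0x44332211 and DestHi = bswap(Lo) = 0x88776655, i.e. the result
              // is 0x8877665544332211.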
   4194     } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
   4195                Val->getType() == IceType_i32) {
   4196       Variable *T = legalizeToReg(Val);
   4197       _bswap(T);
   4198       _mov(Dest, T);
   4199     } else {
   4200       assert(Val->getType() == IceType_i16);
   4201       Constant *Eight = Ctx->getConstantInt16(8);
   4202       Variable *T = nullptr;
   4203       Val = legalize(Val);
   4204       _mov(T, Val);
   4205       _rol(T, Eight);
   4206       _mov(Dest, T);
   4207     }
   4208     return;
   4209   }
   4210   case Intrinsics::Ctpop: {
   4211     Variable *Dest = Instr->getDest();
   4212     Variable *T = nullptr;
   4213     Operand *Val = Instr->getArg(0);
   4214     Type ValTy = Val->getType();
   4215     assert(ValTy == IceType_i32 || ValTy == IceType_i64);
   4216 
   4217     if (!Traits::Is64Bit) {
   4218       T = Dest;
   4219     } else {
   4220       T = makeReg(IceType_i64);
   4221       if (ValTy == IceType_i32) {
   4222         // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
   4223         // converting the value to 64 bits and using ctpop_i64. _movzx should
   4224         // ensure no bits are set in Val's upper 32 bits.
   4225         Variable *V = makeReg(IceType_i64);
   4226         Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
   4227         _movzx(V, ValRM);
   4228         Val = V;
   4229       }
   4230       ValTy = IceType_i64;
   4231     }
   4232 
   4233     InstCall *Call =
   4234         makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
   4235                                             : RuntimeHelper::H_call_ctpop_i64,
   4236                        T, 1);
   4237     Call->addArg(Val);
   4238     lowerCall(Call);
   4239     // The popcount helpers always return 32-bit values, while the intrinsic's
   4240     // signature matches the native POPCNT instruction and fills a 64-bit reg
   4241     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
   4242     // the user doesn't do that in the IR. If the user does that in the IR,
   4243     // then this zeroing instruction is dead and gets optimized out.
   4244     if (!Traits::Is64Bit) {
   4245       assert(T == Dest);
   4246       if (Val->getType() == IceType_i64) {
   4247         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   4248         Constant *Zero = Ctx->getConstantZero(IceType_i32);
   4249         _mov(DestHi, Zero);
   4250       }
   4251     } else {
   4252       assert(Val->getType() == IceType_i64);
   4253       // T is 64 bit. It needs to be copied to dest. We need to:
   4254       //
   4255       // T_1.32 = trunc T.64 to i32
   4256       // T_2.64 = zext T_1.32 to i64
   4257       // Dest.<<right_size>> = T_2.<<right_size>>
   4258       //
   4259       // which ensures the upper 32 bits will always be cleared. Just doing a
   4260       //
   4261       // mov Dest.32 = trunc T.32 to i32
   4262       //
   4263       // is dangerous because there's a chance the compiler will optimize this
   4264       // copy out. To use _movzx we need two new registers (one 32-bit and
   4265       // another 64-bit wide).
   4266       Variable *T_1 = makeReg(IceType_i32);
   4267       _mov(T_1, T);
   4268       Variable *T_2 = makeReg(IceType_i64);
   4269       _movzx(T_2, T_1);
   4270       _mov(Dest, T_2);
   4271     }
   4272     return;
   4273   }
   4274   case Intrinsics::Ctlz: {
   4275     // The "is zero undef" parameter is ignored and we always return a
   4276     // well-defined value.
   4277     Operand *Val = legalize(Instr->getArg(0));
   4278     Operand *FirstVal;
   4279     Operand *SecondVal = nullptr;
   4280     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
   4281       FirstVal = loOperand(Val);
   4282       SecondVal = hiOperand(Val);
   4283     } else {
   4284       FirstVal = Val;
   4285     }
   4286     constexpr bool IsCttz = false;
   4287     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
   4288                     SecondVal);
   4289     return;
   4290   }
   4291   case Intrinsics::Cttz: {
   4292     // The "is zero undef" parameter is ignored and we always return a
   4293     // well-defined value.
   4294     Operand *Val = legalize(Instr->getArg(0));
   4295     Operand *FirstVal;
   4296     Operand *SecondVal = nullptr;
   4297     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
   4298       FirstVal = hiOperand(Val);
   4299       SecondVal = loOperand(Val);
   4300     } else {
   4301       FirstVal = Val;
   4302     }
   4303     constexpr bool IsCttz = true;
   4304     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
   4305                     SecondVal);
   4306     return;
   4307   }
   4308   case Intrinsics::Fabs: {
   4309     Operand *Src = legalize(Instr->getArg(0));
   4310     Type Ty = Src->getType();
   4311     Variable *Dest = Instr->getDest();
   4312     Variable *T = makeVectorOfFabsMask(Ty);
   4313     // The pand instruction operates on an m128 memory operand, so if Src is an
   4314     // f32 or f64, we need to make sure it's in a register.
   4315     if (isVectorType(Ty)) {
   4316       if (llvm::isa<X86OperandMem>(Src))
   4317         Src = legalizeToReg(Src);
   4318     } else {
   4319       Src = legalizeToReg(Src);
   4320     }
   4321     _pand(T, Src);
   4322     if (isVectorType(Ty))
   4323       _movp(Dest, T);
   4324     else
   4325       _mov(Dest, T);
   4326     return;
   4327   }
   4328   case Intrinsics::Longjmp: {
   4329     InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
   4330     Call->addArg(Instr->getArg(0));
   4331     Call->addArg(Instr->getArg(1));
   4332     lowerCall(Call);
   4333     return;
   4334   }
   4335   case Intrinsics::Memcpy: {
   4336     lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
   4337     return;
   4338   }
   4339   case Intrinsics::Memmove: {
   4340     lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
   4341     return;
   4342   }
   4343   case Intrinsics::Memset: {
   4344     lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
   4345     return;
   4346   }
   4347   case Intrinsics::NaClReadTP: {
   4348     if (NeedSandboxing) {
   4349       Operand *Src =
   4350           dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand);
   4351       Variable *Dest = Instr->getDest();
   4352       Variable *T = nullptr;
   4353       _mov(T, Src);
   4354       _mov(Dest, T);
   4355     } else {
   4356       InstCall *Call =
   4357           makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0);
   4358       lowerCall(Call);
   4359     }
   4360     return;
   4361   }
   4362   case Intrinsics::Setjmp: {
   4363     InstCall *Call =
   4364         makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
   4365     Call->addArg(Instr->getArg(0));
   4366     lowerCall(Call);
   4367     return;
   4368   }
   4369   case Intrinsics::Sqrt: {
   4370     assert(isScalarFloatingType(Instr->getDest()->getType()) ||
   4371            getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl);
   4372     Operand *Src = legalize(Instr->getArg(0));
   4373     Variable *Dest = Instr->getDest();
   4374     Variable *T = makeReg(Dest->getType());
   4375     _sqrt(T, Src);
   4376     _mov(Dest, T);
   4377     return;
   4378   }
   4379   case Intrinsics::Stacksave: {
   4380     if (!Traits::Is64Bit || !NeedSandboxing) {
   4381       Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(),
   4382                                                              Traits::WordType);
   4383       Variable *Dest = Instr->getDest();
   4384       _mov(Dest, esp);
   4385       return;
   4386     }
   4387     Variable *esp = Func->getTarget()->getPhysicalRegister(
   4388         Traits::RegisterSet::Reg_esp, IceType_i32);
   4389     Variable *Dest = Instr->getDest();
   4390     _mov(Dest, esp);
   4391 
   4392     return;
   4393   }
   4394   case Intrinsics::Stackrestore: {
   4395     Operand *Src = Instr->getArg(0);
   4396     _mov_sp(Src);
   4397     return;
   4398   }
   4399 
   4400   case Intrinsics::Trap:
   4401     _ud2();
   4402     return;
   4403   case Intrinsics::LoadSubVector: {
   4404     assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
   4405            "LoadSubVector second argument must be a constant");
   4406     Variable *Dest = Instr->getDest();
   4407     Type Ty = Dest->getType();
   4408     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
   4409     Operand *Addr = Instr->getArg(0);
   4410     X86OperandMem *Src = formMemoryOperand(Addr, Ty);
   4411     doMockBoundsCheck(Src);
   4412 
   4413     if (Dest->isRematerializable()) {
   4414       Context.insert<InstFakeDef>(Dest);
   4415       return;
   4416     }
   4417 
   4418     auto *T = makeReg(Ty);
   4419     switch (SubVectorSize->getValue()) {
   4420     case 4:
   4421       _movd(T, Src);
   4422       break;
   4423     case 8:
   4424       _movq(T, Src);
   4425       break;
   4426     default:
   4427       Func->setError("Unexpected size for LoadSubVector");
   4428       return;
   4429     }
   4430     _movp(Dest, T);
   4431     return;
   4432   }
   4433   case Intrinsics::StoreSubVector: {
   4434     assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
   4435            "StoreSubVector third argument must be a constant");
   4436     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
   4437     Operand *Value = Instr->getArg(0);
   4438     Operand *Addr = Instr->getArg(1);
   4439     X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
   4440     doMockBoundsCheck(NewAddr);
   4441 
   4442     Value = legalizeToReg(Value);
   4443 
   4444     switch (SubVectorSize->getValue()) {
   4445     case 4:
   4446       _stored(Value, NewAddr);
   4447       break;
   4448     case 8:
   4449       _storeq(Value, NewAddr);
   4450       break;
   4451     default:
   4452       Func->setError("Unexpected size for StoreSubVector");
   4453       return;
   4454     }
   4455     return;
   4456   }
   4457   case Intrinsics::VectorPackSigned: {
   4458     Operand *Src0 = Instr->getArg(0);
   4459     Operand *Src1 = Instr->getArg(1);
   4460     Variable *Dest = Instr->getDest();
   4461     auto *T = makeReg(Src0->getType());
   4462     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4463     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4464     _movp(T, Src0RM);
   4465     _packss(T, Src1RM);
   4466     _movp(Dest, T);
   4467     return;
   4468   }
   4469   case Intrinsics::VectorPackUnsigned: {
   4470     Operand *Src0 = Instr->getArg(0);
   4471     Operand *Src1 = Instr->getArg(1);
   4472     Variable *Dest = Instr->getDest();
   4473     auto *T = makeReg(Src0->getType());
   4474     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4475     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4476     _movp(T, Src0RM);
   4477     _packus(T, Src1RM);
   4478     _movp(Dest, T);
   4479     return;
   4480   }
   4481   case Intrinsics::SignMask: {
   4482     Operand *SrcReg = legalizeToReg(Instr->getArg(0));
   4483     Variable *Dest = Instr->getDest();
   4484     Variable *T = makeReg(IceType_i32);
   4485     if (SrcReg->getType() == IceType_v4f32 ||
   4486         SrcReg->getType() == IceType_v4i32 ||
   4487         SrcReg->getType() == IceType_v16i8) {
   4488       _movmsk(T, SrcReg);
   4489     } else {
   4490       // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
   4491       llvm::report_fatal_error("Invalid type for SignMask intrinsic");
   4492     }
   4493     _mov(Dest, T);
   4494     return;
   4495   }
   4496   case Intrinsics::MultiplyHighSigned: {
   4497     Operand *Src0 = Instr->getArg(0);
   4498     Operand *Src1 = Instr->getArg(1);
   4499     Variable *Dest = Instr->getDest();
   4500     auto *T = makeReg(Dest->getType());
   4501     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4502     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4503     _movp(T, Src0RM);
   4504     _pmulhw(T, Src1RM);
   4505     _movp(Dest, T);
   4506     return;
   4507   }
   4508   case Intrinsics::MultiplyHighUnsigned: {
   4509     Operand *Src0 = Instr->getArg(0);
   4510     Operand *Src1 = Instr->getArg(1);
   4511     Variable *Dest = Instr->getDest();
   4512     auto *T = makeReg(Dest->getType());
   4513     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4514     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4515     _movp(T, Src0RM);
   4516     _pmulhuw(T, Src1RM);
   4517     _movp(Dest, T);
   4518     return;
   4519   }
   4520   case Intrinsics::MultiplyAddPairs: {
   4521     Operand *Src0 = Instr->getArg(0);
   4522     Operand *Src1 = Instr->getArg(1);
   4523     Variable *Dest = Instr->getDest();
   4524     auto *T = makeReg(Dest->getType());
   4525     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4526     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4527     _movp(T, Src0RM);
   4528     _pmaddwd(T, Src1RM);
   4529     _movp(Dest, T);
   4530     return;
   4531   }
   4532   case Intrinsics::AddSaturateSigned: {
   4533     Operand *Src0 = Instr->getArg(0);
   4534     Operand *Src1 = Instr->getArg(1);
   4535     Variable *Dest = Instr->getDest();
   4536     auto *T = makeReg(Dest->getType());
   4537     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4538     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4539     _movp(T, Src0RM);
   4540     _padds(T, Src1RM);
   4541     _movp(Dest, T);
   4542     return;
   4543   }
   4544   case Intrinsics::SubtractSaturateSigned: {
   4545     Operand *Src0 = Instr->getArg(0);
   4546     Operand *Src1 = Instr->getArg(1);
   4547     Variable *Dest = Instr->getDest();
   4548     auto *T = makeReg(Dest->getType());
   4549     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4550     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4551     _movp(T, Src0RM);
   4552     _psubs(T, Src1RM);
   4553     _movp(Dest, T);
   4554     return;
   4555   }
   4556   case Intrinsics::AddSaturateUnsigned: {
   4557     Operand *Src0 = Instr->getArg(0);
   4558     Operand *Src1 = Instr->getArg(1);
   4559     Variable *Dest = Instr->getDest();
   4560     auto *T = makeReg(Dest->getType());
   4561     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4562     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4563     _movp(T, Src0RM);
   4564     _paddus(T, Src1RM);
   4565     _movp(Dest, T);
   4566     return;
   4567   }
   4568   case Intrinsics::SubtractSaturateUnsigned: {
   4569     Operand *Src0 = Instr->getArg(0);
   4570     Operand *Src1 = Instr->getArg(1);
   4571     Variable *Dest = Instr->getDest();
   4572     auto *T = makeReg(Dest->getType());
   4573     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   4574     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   4575     _movp(T, Src0RM);
   4576     _psubus(T, Src1RM);
   4577     _movp(Dest, T);
   4578     return;
   4579   }
   4580   case Intrinsics::Nearbyint: {
   4581     Operand *Src = Instr->getArg(0);
   4582     Variable *Dest = Instr->getDest();
   4583     Type DestTy = Dest->getType();
   4584     if (isVectorType(DestTy)) {
   4585       assert(DestTy == IceType_v4i32);
   4586       assert(Src->getType() == IceType_v4f32);
   4587       Operand *Src0R = legalizeToReg(Src);
   4588       Variable *T = makeReg(DestTy);
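              // cvtps2dq rounds each lane according to the current MXCSR rounding
              // mode, which is the nearbyint-style behavior wanted here.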
   4589       _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq);
   4590       _movp(Dest, T);
   4591     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
   4592       llvm::report_fatal_error("Helper call was expected");
   4593     } else {
   4594       Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
   4595       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
   4596       Variable *T_1 = nullptr;
   4597       if (Traits::Is64Bit && DestTy == IceType_i64) {
   4598         T_1 = makeReg(IceType_i64);
   4599       } else {
   4600         assert(DestTy != IceType_i64);
   4601         T_1 = makeReg(IceType_i32);
   4602       }
   4603       // cvt() requires its integer argument to be a GPR.
   4604       Variable *T_2 = makeReg(DestTy);
   4605       if (isByteSizedType(DestTy)) {
   4606         assert(T_1->getType() == IceType_i32);
   4607         T_1->setRegClass(RCX86_Is32To8);
   4608         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
   4609       }
   4610       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si);
   4611       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
   4612       if (DestTy == IceType_i1)
   4613         _and(T_2, Ctx->getConstantInt1(1));
   4614       _mov(Dest, T_2);
   4615     }
   4616     return;
   4617   }
   4618   case Intrinsics::Round: {
   4619     assert(InstructionSet >= Traits::SSE4_1);
   4620     Variable *Dest = Instr->getDest();
   4621     Operand *Src = Instr->getArg(0);
   4622     Operand *Mode = Instr->getArg(1);
   4623     assert(llvm::isa<ConstantInteger32>(Mode) &&
   4624            "Round last argument must be a constant");
   4625     auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
   4626     int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
   4627     (void)Imm;
   4628     assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
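            // Modes 0-3 map to round-to-nearest-even, down, up, and truncate,
            // following the SSE4.1 rounding-control immediate encoding.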
   4629     auto *T = makeReg(Dest->getType());
   4630     _round(T, SrcRM, Mode);
   4631     _movp(Dest, T);
   4632     return;
   4633   }
   4634   default: // UnknownIntrinsic
   4635     Func->setError("Unexpected intrinsic");
   4636     return;
   4637   }
   4638   return;
   4639 }
   4640 
   4641 template <typename TraitsType>
   4642 void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
   4643                                                    Operand *Ptr,
   4644                                                    Operand *Expected,
   4645                                                    Operand *Desired) {
   4646   Type Ty = Expected->getType();
   4647   if (!Traits::Is64Bit && Ty == IceType_i64) {
   4648     // Reserve the pre-colored registers first, before adding any more
   4649     // infinite-weight variables from formMemoryOperand's legalization.
   4650     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
   4651     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
   4652     Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
   4653     Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
   4654     _mov(T_eax, loOperand(Expected));
   4655     _mov(T_edx, hiOperand(Expected));
   4656     _mov(T_ebx, loOperand(Desired));
   4657     _mov(T_ecx, hiOperand(Desired));
   4658     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
   4659     constexpr bool Locked = true;
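            // lock cmpxchg8b compares edx:eax against the 8-byte memory operand;
            // on a match it stores ecx:ebx there, otherwise it loads the current
            // value into edx:eax. Either way edx:eax ends up with the old value.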
   4660     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
   4661     auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
   4662     auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
   4663     _mov(DestLo, T_eax);
   4664     _mov(DestHi, T_edx);
   4665     return;
   4666   }
   4667   RegNumT Eax;
   4668   switch (Ty) {
   4669   default:
   4670     llvm::report_fatal_error("Bad type for cmpxchg");
   4671   case IceType_i64:
   4672     Eax = Traits::getRaxOrDie();
   4673     break;
   4674   case IceType_i32:
   4675     Eax = Traits::RegisterSet::Reg_eax;
   4676     break;
   4677   case IceType_i16:
   4678     Eax = Traits::RegisterSet::Reg_ax;
   4679     break;
   4680   case IceType_i8:
   4681     Eax = Traits::RegisterSet::Reg_al;
   4682     break;
   4683   }
   4684   Variable *T_eax = makeReg(Ty, Eax);
   4685   _mov(T_eax, Expected);
   4686   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
   4687   Variable *DesiredReg = legalizeToReg(Desired);
   4688   constexpr bool Locked = true;
   4689   _cmpxchg(Addr, T_eax, DesiredReg, Locked);
   4690   _mov(DestPrev, T_eax);
   4691 }
   4692 
   4693 template <typename TraitsType>
   4694 bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
   4695                                                          Operand *PtrToMem,
   4696                                                          Operand *Expected,
   4697                                                          Operand *Desired) {
   4698   if (Func->getOptLevel() == Opt_m1)
   4699     return false;
   4700   // Peek ahead a few instructions and see how Dest is used.
   4701   // It's very common to have:
   4702   //
   4703   // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
   4704   // [%y_phi = ...] // list of phi stores
   4705   // %p = icmp eq i32 %x, %expected
   4706   // br i1 %p, label %l1, label %l2
   4707   //
   4708   // which we can optimize into:
   4709   //
   4710   // %x = <cmpxchg code>
   4711   // [%y_phi = ...] // list of phi stores
   4712   // br eq, %l1, %l2
   4713   InstList::iterator I = Context.getCur();
   4714   // I is currently the InstIntrinsicCall. Peek past that.
   4715   // This assumes that the atomic cmpxchg has not been lowered yet,
   4716   // so that the instructions seen in the scan from "Cur" are simple.
   4717   assert(llvm::isa<InstIntrinsicCall>(*I));
   4718   Inst *NextInst = Context.getNextInst(I);
   4719   if (!NextInst)
   4720     return false;
   4721   // There might be phi assignments right before the compare+branch, since this
   4722   // could be a backward branch for a loop. This placement of assignments is
   4723   // determined by placePhiStores().
   4724   CfgVector<InstAssign *> PhiAssigns;
   4725   while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
   4726     if (PhiAssign->getDest() == Dest)
   4727       return false;
   4728     PhiAssigns.push_back(PhiAssign);
   4729     NextInst = Context.getNextInst(I);
   4730     if (!NextInst)
   4731       return false;
   4732   }
   4733   if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
   4734     if (!(NextCmp->getCondition() == InstIcmp::Eq &&
   4735           ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
   4736            (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
   4737       return false;
   4738     }
   4739     NextInst = Context.getNextInst(I);
   4740     if (!NextInst)
   4741       return false;
   4742     if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
   4743       if (!NextBr->isUnconditional() &&
   4744           NextCmp->getDest() == NextBr->getCondition() &&
   4745           NextBr->isLastUse(NextCmp->getDest())) {
   4746         lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
   4747         for (size_t i = 0; i < PhiAssigns.size(); ++i) {
   4748           // Lower the phi assignments now, before the branch (same placement
   4749           // as before).
   4750           InstAssign *PhiAssign = PhiAssigns[i];
   4751           PhiAssign->setDeleted();
   4752           lowerAssign(PhiAssign);
   4753           Context.advanceNext();
   4754         }
   4755         _br(Traits::Cond::Br_e, NextBr->getTargetTrue(),
   4756             NextBr->getTargetFalse());
   4757         // Skip over the old compare and branch, by deleting them.
   4758         NextCmp->setDeleted();
   4759         NextBr->setDeleted();
   4760         Context.advanceNext();
   4761         Context.advanceNext();
   4762         return true;
   4763       }
   4764     }
   4765   }
   4766   return false;
   4767 }
   4768 
   4769 template <typename TraitsType>
   4770 void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
   4771                                                uint32_t Operation, Operand *Ptr,
   4772                                                Operand *Val) {
   4773   bool NeedsCmpxchg = false;
   4774   LowerBinOp Op_Lo = nullptr;
   4775   LowerBinOp Op_Hi = nullptr;
   4776   switch (Operation) {
   4777   default:
   4778     Func->setError("Unknown AtomicRMW operation");
   4779     return;
   4780   case Intrinsics::AtomicAdd: {
   4781     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
   4782       // All the fall-through paths must set this to true; it is used
   4783       // only for asserting.
   4784       NeedsCmpxchg = true;
   4785       Op_Lo = &TargetX86Base<TraitsType>::_add;
   4786       Op_Hi = &TargetX86Base<TraitsType>::_adc;
   4787       break;
   4788     }
   4789     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
   4790     constexpr bool Locked = true;
   4791     Variable *T = nullptr;
   4792     _mov(T, Val);
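            // lock xadd exchanges T with the memory operand and writes back their
            // sum, so T ends up holding the old memory value, which is exactly
            // what the intrinsic must return.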
   4793     _xadd(Addr, T, Locked);
   4794     _mov(Dest, T);
   4795     return;
   4796   }
   4797   case Intrinsics::AtomicSub: {
   4798     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
   4799       NeedsCmpxchg = true;
   4800       Op_Lo = &TargetX86Base<TraitsType>::_sub;
   4801       Op_Hi = &TargetX86Base<TraitsType>::_sbb;
   4802       break;
   4803     }
   4804     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
   4805     constexpr bool Locked = true;
   4806     Variable *T = nullptr;
   4807     _mov(T, Val);
   4808     _neg(T);
   4809     _xadd(Addr, T, Locked);
   4810     _mov(Dest, T);
   4811     return;
   4812   }
   4813   case Intrinsics::AtomicOr:
   4814     // TODO(jvoung): If Dest is null or dead, then some of these
   4815     // operations do not need an "exchange", but just a locked op.
   4816     // That appears to be "worth" it for sub, or, and, and xor.
   4817     // xadd is probably fine vs lock add for add, and xchg is fine
   4818     // vs an atomic store.
   4819     NeedsCmpxchg = true;
   4820     Op_Lo = &TargetX86Base<TraitsType>::_or;
   4821     Op_Hi = &TargetX86Base<TraitsType>::_or;
   4822     break;
   4823   case Intrinsics::AtomicAnd:
   4824     NeedsCmpxchg = true;
   4825     Op_Lo = &TargetX86Base<TraitsType>::_and;
   4826     Op_Hi = &TargetX86Base<TraitsType>::_and;
   4827     break;
   4828   case Intrinsics::AtomicXor:
   4829     NeedsCmpxchg = true;
   4830     Op_Lo = &TargetX86Base<TraitsType>::_xor;
   4831     Op_Hi = &TargetX86Base<TraitsType>::_xor;
   4832     break;
   4833   case Intrinsics::AtomicExchange:
   4834     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
   4835       NeedsCmpxchg = true;
   4836       // NeedsCmpxchg, but no real Op_Lo/Op_Hi operations are needed. The values
   4837       // just need to be moved to the ecx and ebx registers.
   4838       Op_Lo = nullptr;
   4839       Op_Hi = nullptr;
   4840       break;
   4841     }
   4842     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
   4843     Variable *T = nullptr;
   4844     _mov(T, Val);
   4845     _xchg(Addr, T);
   4846     _mov(Dest, T);
   4847     return;
   4848   }
   4849   // Otherwise, we need a cmpxchg loop.
   4850   (void)NeedsCmpxchg;
   4851   assert(NeedsCmpxchg);
   4852   expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
   4853 }
   4854 
   4855 template <typename TraitsType>
   4856 void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
   4857                                                          LowerBinOp Op_Hi,
   4858                                                          Variable *Dest,
   4859                                                          Operand *Ptr,
   4860                                                          Operand *Val) {
   4861   // Expand a more complex RMW operation as a cmpxchg loop:
   4862   // For 64-bit:
   4863   //   mov     eax, [ptr]
   4864   //   mov     edx, [ptr + 4]
   4865   // .LABEL:
   4866   //   mov     ebx, eax
   4867   //   <Op_Lo> ebx, <desired_adj_lo>
   4868   //   mov     ecx, edx
   4869   //   <Op_Hi> ecx, <desired_adj_hi>
   4870   //   lock cmpxchg8b [ptr]
   4871   //   jne     .LABEL
   4872   //   mov     <dest_lo>, eax
   4873   //   mov     <dest_hi>, edx
   4874   //
   4875   // For 32-bit:
   4876   //   mov     eax, [ptr]
   4877   // .LABEL:
   4878   //   mov     <reg>, eax
   4879   //   op      <reg>, [desired_adj]
   4880   //   lock cmpxchg [ptr], <reg>
   4881   //   jne     .LABEL
   4882   //   mov     <dest>, eax
   4883   //
   4884   // If Op_{Lo,Hi} are nullptr, then just copy the value.
   4885   Val = legalize(Val);
   4886   Type Ty = Val->getType();
   4887   if (!Traits::Is64Bit && Ty == IceType_i64) {
   4888     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
   4889     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
   4890     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
   4891     _mov(T_eax, loOperand(Addr));
   4892     _mov(T_edx, hiOperand(Addr));
   4893     Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
   4894     Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
   4895     InstX86Label *Label = InstX86Label::create(Func, this);
   4896     const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
   4897     if (!IsXchg8b) {
   4898       Context.insert(Label);
   4899       _mov(T_ebx, T_eax);
   4900       (this->*Op_Lo)(T_ebx, loOperand(Val));
   4901       _mov(T_ecx, T_edx);
   4902       (this->*Op_Hi)(T_ecx, hiOperand(Val));
   4903     } else {
   4904       // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
   4905       // It just needs the Val loaded into ebx and ecx.
   4906       // That can also be done before the loop.
   4907       _mov(T_ebx, loOperand(Val));
   4908       _mov(T_ecx, hiOperand(Val));
   4909       Context.insert(Label);
   4910     }
   4911     constexpr bool Locked = true;
   4912     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
   4913     _br(Traits::Cond::Br_ne, Label);
   4914     if (!IsXchg8b) {
   4915       // If Val is a variable, model the extended live range of Val through
   4916       // the end of the loop, since it will be re-used by the loop.
   4917       if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
   4918         auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
   4919         auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
   4920         Context.insert<InstFakeUse>(ValLo);
   4921         Context.insert<InstFakeUse>(ValHi);
   4922       }
   4923     } else {
   4924       // For xchg, the loop is slightly smaller and ebx/ecx are used.
   4925       Context.insert<InstFakeUse>(T_ebx);
   4926       Context.insert<InstFakeUse>(T_ecx);
   4927     }
   4928     // The address base (if any) is also reused in the loop.
   4929     if (Variable *Base = Addr->getBase())
   4930       Context.insert<InstFakeUse>(Base);
   4931     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   4932     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   4933     _mov(DestLo, T_eax);
   4934     _mov(DestHi, T_edx);
   4935     return;
   4936   }
   4937   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
   4938   RegNumT Eax;
   4939   switch (Ty) {
   4940   default:
   4941     llvm::report_fatal_error("Bad type for atomicRMW");
   4942   case IceType_i64:
   4943     Eax = Traits::getRaxOrDie();
   4944     break;
   4945   case IceType_i32:
   4946     Eax = Traits::RegisterSet::Reg_eax;
   4947     break;
   4948   case IceType_i16:
   4949     Eax = Traits::RegisterSet::Reg_ax;
   4950     break;
   4951   case IceType_i8:
   4952     Eax = Traits::RegisterSet::Reg_al;
   4953     break;
   4954   }
   4955   Variable *T_eax = makeReg(Ty, Eax);
   4956   _mov(T_eax, Addr);
   4957   auto *Label = Context.insert<InstX86Label>(this);
   4958   // We want T to be assigned a different register than Eax, so don't use the
   4959   // _mov(T == nullptr, T_eax) auto-creation idiom here.
   4960   Variable *T = makeReg(Ty);
   4961   _mov(T, T_eax);
   4962   (this->*Op_Lo)(T, Val);
   4963   constexpr bool Locked = true;
   4964   _cmpxchg(Addr, T_eax, T, Locked);
   4965   _br(Traits::Cond::Br_ne, Label);
   4966   // If Val is a variable, model the extended live range of Val through
   4967   // the end of the loop, since it will be re-used by the loop.
   4968   if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
   4969     Context.insert<InstFakeUse>(ValVar);
   4970   }
   4971   // The address base (if any) is also reused in the loop.
   4972   if (Variable *Base = Addr->getBase())
   4973     Context.insert<InstFakeUse>(Base);
   4974   _mov(Dest, T_eax);
   4975 }
   4976 
   4977 /// Lowers count {trailing, leading} zeros intrinsic.
   4978 ///
   4979 /// We could do constant folding here, but that should have
   4980 /// been done by the front-end/middle-end optimizations.
   4981 template <typename TraitsType>
   4982 void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
   4983                                                 Variable *Dest,
   4984                                                 Operand *FirstVal,
   4985                                                 Operand *SecondVal) {
   4986   // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
   4987   // Then the instructions will handle the Val == 0 case much more simply
   4988   // and won't require conversion from bit position to number of zeros.
   4989   //
   4990   // Otherwise:
   4991   //   bsr IF_NOT_ZERO, Val
   4992   //   mov T_DEST, ((Ty == i32) ? 63 : 127)
   4993   //   cmovne T_DEST, IF_NOT_ZERO
   4994   //   xor T_DEST, ((Ty == i32) ? 31 : 63)
   4995   //   mov DEST, T_DEST
   4996   //
   4997   // NOTE: T_DEST must be a register because cmov requires its dest to be a
   4998   // register. Also, bsf and bsr require their dest to be a register.
   4999   //
   5000   // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
   5001   // E.g., for 000... 00001100, bsr will say that the most significant bit
   5002   // set is at position 3, while the number of leading zeros is 28. Xor is
   5003   // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
   5004   // all-zeros case).
   5005   //
   5006   // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
   5007   // bits are all zero, and compute the result for that case (checking the
   5008   // lower 32 bits). Then actually compute the result for the upper bits and
   5009   // cmov in the result from the lower computation if the earlier speculation
   5010   // was correct.
   5011   //
   5012   // Cttz, is similar, but uses bsf instead, and doesn't require the xor
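          // As an illustrative sketch (register names are arbitrary), the ctlz of
          // an i64 on x86-32 comes out roughly as:
          //
          //   bsr    t1, val_lo
          //   mov    t_dest, 63
          //   cmovne t_dest, t1
          //   xor    t_dest, 31
          //   add    t_dest, 32      ; speculative result, assuming val_hi == 0
          //   bsr    t2, val_hi
          //   xor    t2, 31
          //   test   val_hi, val_hi
          //   cmove  t2, t_dest      ; keep the speculation only if val_hi was 0
          //   mov    dest_lo, t2
          //   mov    dest_hi, 0
          //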
   5013   // bit position conversion, and the speculation is reversed.
   5014 
   5015   // TODO(jpp): refactor this method.
   5016   assert(Ty == IceType_i32 || Ty == IceType_i64);
   5017   const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
   5018   Variable *T = makeReg(DestTy);
   5019   Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
   5020   if (Cttz) {
   5021     _bsf(T, FirstValRM);
   5022   } else {
   5023     _bsr(T, FirstValRM);
   5024   }
   5025   Variable *T_Dest = makeReg(DestTy);
   5026   Constant *_31 = Ctx->getConstantInt32(31);
   5027   Constant *_32 = Ctx->getConstantInt(DestTy, 32);
   5028   Constant *_63 = Ctx->getConstantInt(DestTy, 63);
   5029   Constant *_64 = Ctx->getConstantInt(DestTy, 64);
   5030   if (Cttz) {
   5031     if (DestTy == IceType_i64) {
   5032       _mov(T_Dest, _64);
   5033     } else {
   5034       _mov(T_Dest, _32);
   5035     }
   5036   } else {
   5037     Constant *_127 = Ctx->getConstantInt(DestTy, 127);
   5038     if (DestTy == IceType_i64) {
   5039       _mov(T_Dest, _127);
   5040     } else {
   5041       _mov(T_Dest, _63);
   5042     }
   5043   }
   5044   _cmov(T_Dest, T, Traits::Cond::Br_ne);
   5045   if (!Cttz) {
   5046     if (DestTy == IceType_i64) {
   5047       // Even though there's a _63 available at this point, that constant might
   5048       // not be an i32, which will cause the xor emission to fail.
   5049       Constant *_63 = Ctx->getConstantInt32(63);
   5050       _xor(T_Dest, _63);
   5051     } else {
   5052       _xor(T_Dest, _31);
   5053     }
   5054   }
   5055   if (Traits::Is64Bit || Ty == IceType_i32) {
   5056     _mov(Dest, T_Dest);
   5057     return;
   5058   }
   5059   _add(T_Dest, _32);
   5060   auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   5061   auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   5062   // Will be using "test" on this, so we need a registerized variable.
   5063   Variable *SecondVar = legalizeToReg(SecondVal);
   5064   Variable *T_Dest2 = makeReg(IceType_i32);
   5065   if (Cttz) {
   5066     _bsf(T_Dest2, SecondVar);
   5067   } else {
   5068     _bsr(T_Dest2, SecondVar);
   5069     _xor(T_Dest2, _31);
   5070   }
   5071   _test(SecondVar, SecondVar);
   5072   _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
   5073   _mov(DestLo, T_Dest2);
   5074   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
   5075 }
   5076 
   5077 template <typename TraitsType>
   5078 void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
   5079                                           Variable *Base, Constant *Offset) {
   5080   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
   5081   // legalize Mem properly.
   5082   if (Offset)
   5083     assert(!llvm::isa<ConstantRelocatable>(Offset));
   5084 
   5085   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
   5086 
   5087   if (isVectorType(Ty))
   5088     _movp(Dest, Mem);
   5089   else if (Ty == IceType_f64)
   5090     _movq(Dest, Mem);
   5091   else
   5092     _mov(Dest, Mem);
   5093 }
   5094 
   5095 template <typename TraitsType>
   5096 void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
   5097                                            Variable *Base, Constant *Offset) {
   5098   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
   5099   // legalize Mem properly.
   5100   if (Offset)
   5101     assert(!llvm::isa<ConstantRelocatable>(Offset));
   5102 
   5103   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
   5104 
   5105   if (isVectorType(Ty))
   5106     _storep(Value, Mem);
   5107   else if (Ty == IceType_f64)
   5108     _storeq(Value, Mem);
   5109   else
   5110     _store(Value, Mem);
   5111 }
   5112 
   5113 template <typename TraitsType>
   5114 void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
   5115                                            Variable *Src, int32_t OffsetAmt) {
   5116   Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
   5117   // TODO(ascull): this or add nullptr test to _movp, _movq
   5118   Variable *Data = makeReg(Ty);
   5119 
   5120   typedLoad(Ty, Data, Src, Offset);
   5121   typedStore(Ty, Data, Dest, Offset);
   5122 }
   5123 
   5124 template <typename TraitsType>
   5125 void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
   5126                                             Operand *Count) {
   5127   // There is a load and store for each chunk in the unroll
   5128   constexpr uint32_t BytesPerStorep = 16;
   5129 
   5130   // Check if the operands are constants
   5131   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   5132   const bool IsCountConst = CountConst != nullptr;
   5133   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
   5134 
   5135   if (shouldOptimizeMemIntrins() && IsCountConst &&
   5136       CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
   5137     // Unlikely, but nothing to do if it does happen
   5138     if (CountValue == 0)
   5139       return;
   5140 
   5141     Variable *SrcBase = legalizeToReg(Src);
   5142     Variable *DestBase = legalizeToReg(Dest);
   5143 
   5144     // Find the largest type that can be used and use it as much as possible in
   5145     // reverse order. Then handle any remainder with overlapping copies. Since
   5146     // the remainder will be at the end, there will be reduced pressure on the
   5147     // memory unit as the accesses to the same memory are far apart.
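            // For example, assuming largestTypeInSize and firstTypeThatFitsSize
            // both pick the 16-byte vector type here, a 29-byte copy becomes a
            // 16-byte copy at offset 0 followed by a 16-byte copy at offset 13;
            // bytes 13..15 are simply written twice with the same data.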
   5148     Type Ty = largestTypeInSize(CountValue);
   5149     uint32_t TyWidth = typeWidthInBytes(Ty);
   5150 
   5151     uint32_t RemainingBytes = CountValue;
   5152     int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
   5153     while (RemainingBytes >= TyWidth) {
   5154       copyMemory(Ty, DestBase, SrcBase, Offset);
   5155       RemainingBytes -= TyWidth;
   5156       Offset -= TyWidth;
   5157     }
   5158 
   5159     if (RemainingBytes == 0)
   5160       return;
   5161 
   5162     // Lower the remaining bytes. Adjust to larger types in order to make use
   5163     // of overlaps in the copies.
   5164     Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
   5165     Offset = CountValue - typeWidthInBytes(LeftOverTy);
   5166     copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
   5167     return;
   5168   }
   5169 
   5170   // Fall back on a function call
   5171   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
   5172   Call->addArg(Dest);
   5173   Call->addArg(Src);
   5174   Call->addArg(Count);
   5175   lowerCall(Call);
   5176 }
   5177 
   5178 template <typename TraitsType>
   5179 void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
   5180                                              Operand *Count) {
   5181   // There is a load and store for each chunk in the unroll
   5182   constexpr uint32_t BytesPerStorep = 16;
   5183 
   5184   // Check if the operands are constants
   5185   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   5186   const bool IsCountConst = CountConst != nullptr;
   5187   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
   5188 
   5189   if (shouldOptimizeMemIntrins() && IsCountConst &&
   5190       CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
   5191     // Unlikely, but nothing to do if it does happen
   5192     if (CountValue == 0)
   5193       return;
   5194 
   5195     Variable *SrcBase = legalizeToReg(Src);
   5196     Variable *DestBase = legalizeToReg(Dest);
   5197 
   5198     std::tuple<Type, Constant *, Variable *>
   5199         Moves[Traits::MEMMOVE_UNROLL_LIMIT];
   5200     Constant *Offset;
   5201     Variable *Reg;
   5202 
   5203     // Copy the data into registers as the source and destination could overlap
   5204     // so make sure not to clobber the memory. This also means overlapping
   5205     // moves can be used as we are taking a safe snapshot of the memory.
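            // E.g., even if Dest overlaps Src by a few bytes, every chunk is read
            // into a register before any store executes, so no store can clobber
            // source bytes that still need to be read.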
   5206     Type Ty = largestTypeInSize(CountValue);
   5207     uint32_t TyWidth = typeWidthInBytes(Ty);
   5208 
   5209     uint32_t RemainingBytes = CountValue;
   5210     int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
   5211     size_t N = 0;
   5212     while (RemainingBytes >= TyWidth) {
   5213       assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
   5214       Offset = Ctx->getConstantInt32(OffsetAmt);
   5215       Reg = makeReg(Ty);
   5216       typedLoad(Ty, Reg, SrcBase, Offset);
   5217       RemainingBytes -= TyWidth;
   5218       OffsetAmt -= TyWidth;
   5219       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
   5220     }
   5221 
   5222     if (RemainingBytes != 0) {
   5223       // Lower the remaining bytes. Adjust to larger types in order to make use
   5224       // of overlaps in the copies.
   5225       assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
   5226       Ty = firstTypeThatFitsSize(RemainingBytes);
   5227       Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
   5228       Reg = makeReg(Ty);
   5229       typedLoad(Ty, Reg, SrcBase, Offset);
   5230       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
   5231     }
   5232 
   5233     // Copy the data out into the destination memory
   5234     for (size_t i = 0; i < N; ++i) {
   5235       std::tie(Ty, Offset, Reg) = Moves[i];
   5236       typedStore(Ty, Reg, DestBase, Offset);
   5237     }
   5238 
   5239     return;
   5240   }
   5241 
   5242   // Fall back on a function call
   5243   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
   5244   Call->addArg(Dest);
   5245   Call->addArg(Src);
   5246   Call->addArg(Count);
   5247   lowerCall(Call);
   5248 }
   5249 
   5250 template <typename TraitsType>
   5251 void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
   5252                                             Operand *Count) {
   5253   constexpr uint32_t BytesPerStorep = 16;
   5254   constexpr uint32_t BytesPerStoreq = 8;
   5255   constexpr uint32_t BytesPerStorei32 = 4;
   5256   assert(Val->getType() == IceType_i8);
   5257 
   5258   // Check if the operands are constants
   5259   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   5260   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
   5261   const bool IsCountConst = CountConst != nullptr;
   5262   const bool IsValConst = ValConst != nullptr;
   5263   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
   5264   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
   5265 
   5266   // Unlikely, but nothing to do if it does happen
   5267   if (IsCountConst && CountValue == 0)
   5268     return;
   5269 
    5270   // TODO(ascull): if the count is constant but val is not, it would be possible
    5271   // to inline by spreading the value across 4 bytes and accessing subregs, e.g.
   5272   // eax, ax and al.
   5273   if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
   5274     Variable *Base = nullptr;
   5275     Variable *VecReg = nullptr;
   5276     const uint32_t MaskValue = (ValValue & 0xff);
   5277     const uint32_t SpreadValue =
   5278         (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
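             // e.g. ValValue == 0xAB spreads to SpreadValue == 0xABABABAB, so one
             // 32-bit store sets four bytes at once.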
   5279 
   5280     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
   5281                                                         uint32_t OffsetAmt) {
   5282       assert(Base != nullptr);
   5283       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
   5284 
   5285       // TODO(ascull): is 64-bit better with vector or scalar movq?
   5286       auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
   5287       if (isVectorType(Ty)) {
   5288         assert(VecReg != nullptr);
   5289         _storep(VecReg, Mem);
   5290       } else if (Ty == IceType_f64) {
   5291         assert(VecReg != nullptr);
   5292         _storeq(VecReg, Mem);
   5293       } else {
   5294         assert(Ty != IceType_i64);
   5295         _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
   5296       }
   5297     };
   5298 
   5299     // Find the largest type that can be used and use it as much as possible in
   5300     // reverse order. Then handle any remainder with overlapping copies. Since
    5301     // the remainder will be at the end, there will be reduced pressure on the
    5302     // memory unit as the accesses to the same memory are far apart.
   5303     Type Ty;
   5304     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
   5305         CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
   5306       // When the value is zero it can be loaded into a vector register cheaply
   5307       // using the xor trick.
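               // (e.g. "pxor xmm0, xmm0"; no constant-pool load is needed.)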
   5308       Base = legalizeToReg(Dest);
   5309       VecReg = makeVectorOfZeros(IceType_v16i8);
   5310       Ty = largestTypeInSize(CountValue);
   5311     } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
   5312       // When the value is non-zero or the count is small we can't use vector
   5313       // instructions so are limited to 32-bit stores.
   5314       Base = legalizeToReg(Dest);
   5315       constexpr uint32_t MaxSize = 4;
   5316       Ty = largestTypeInSize(CountValue, MaxSize);
   5317     }
   5318 
   5319     if (Base) {
   5320       uint32_t TyWidth = typeWidthInBytes(Ty);
   5321 
   5322       uint32_t RemainingBytes = CountValue;
   5323       uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
   5324       while (RemainingBytes >= TyWidth) {
   5325         lowerSet(Ty, Offset);
   5326         RemainingBytes -= TyWidth;
   5327         Offset -= TyWidth;
   5328       }
   5329 
   5330       if (RemainingBytes == 0)
   5331         return;
   5332 
   5333       // Lower the remaining bytes. Adjust to larger types in order to make use
   5334       // of overlaps in the copies.
   5335       Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
   5336       Offset = CountValue - typeWidthInBytes(LeftOverTy);
   5337       lowerSet(LeftOverTy, Offset);
   5338       return;
   5339     }
   5340   }
   5341 
   5342   // Fall back on calling the memset function. The value operand needs to be
   5343   // extended to a stack slot size because the PNaCl ABI requires arguments to
   5344   // be at least 32 bits wide.
   5345   Operand *ValExt;
   5346   if (IsValConst) {
   5347     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
   5348   } else {
   5349     Variable *ValExtVar = Func->makeVariable(stackSlotType());
   5350     lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
   5351     ValExt = ValExtVar;
   5352   }
   5353   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
   5354   Call->addArg(Dest);
   5355   Call->addArg(ValExt);
   5356   Call->addArg(Count);
   5357   lowerCall(Call);
   5358 }
   5359 
   5360 class AddressOptimizer {
   5361   AddressOptimizer() = delete;
   5362   AddressOptimizer(const AddressOptimizer &) = delete;
   5363   AddressOptimizer &operator=(const AddressOptimizer &) = delete;
   5364 
   5365 public:
   5366   explicit AddressOptimizer(const Cfg *Func)
   5367       : Func(Func), VMetadata(Func->getVMetadata()) {}
   5368 
   5369   inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
   5370                              int32_t Offset, const Variable *Base,
   5371                              const Variable *Index, uint16_t Shift,
   5372                              const Inst *Reason) const;
   5373 
   5374   inline const Inst *matchAssign(Variable **Var,
   5375                                  ConstantRelocatable **Relocatable,
   5376                                  int32_t *Offset);
   5377 
   5378   inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
   5379                                             uint16_t *Shift);
   5380 
   5381   inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
   5382 
   5383   inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
   5384                                             const uint16_t Shift,
   5385                                             ConstantRelocatable **Relocatable,
   5386                                             int32_t *Offset);
   5387 
   5388 private:
   5389   const Cfg *const Func;
   5390   const VariablesMetadata *const VMetadata;
   5391 
   5392   static bool isAdd(const Inst *Instr) {
   5393     if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
   5394       return (Arith->getOp() == InstArithmetic::Add);
   5395     }
   5396     return false;
   5397   }
   5398 };
   5399 
   5400 void AddressOptimizer::dumpAddressOpt(
   5401     const ConstantRelocatable *const Relocatable, int32_t Offset,
   5402     const Variable *Base, const Variable *Index, uint16_t Shift,
   5403     const Inst *Reason) const {
   5404   if (!BuildDefs::dump())
   5405     return;
   5406   if (!Func->isVerbose(IceV_AddrOpt))
   5407     return;
   5408   OstreamLocker L(Func->getContext());
   5409   Ostream &Str = Func->getContext()->getStrDump();
   5410   Str << "Instruction: ";
   5411   Reason->dumpDecorated(Func);
   5412   Str << "  results in Base=";
   5413   if (Base)
   5414     Base->dump(Func);
   5415   else
   5416     Str << "<null>";
   5417   Str << ", Index=";
   5418   if (Index)
   5419     Index->dump(Func);
   5420   else
   5421     Str << "<null>";
   5422   Str << ", Shift=" << Shift << ", Offset=" << Offset
   5423       << ", Relocatable=" << Relocatable << "\n";
   5424 }
   5425 
   5426 const Inst *AddressOptimizer::matchAssign(Variable **Var,
   5427                                           ConstantRelocatable **Relocatable,
   5428                                           int32_t *Offset) {
   5429   // Var originates from Var=SrcVar ==> set Var:=SrcVar
   5430   if (*Var == nullptr)
   5431     return nullptr;
   5432   if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
   5433     assert(!VMetadata->isMultiDef(*Var));
   5434     if (llvm::isa<InstAssign>(VarAssign)) {
   5435       Operand *SrcOp = VarAssign->getSrc(0);
   5436       assert(SrcOp);
   5437       if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
   5438         if (!VMetadata->isMultiDef(SrcVar) &&
   5439             // TODO: ensure SrcVar stays single-BB
   5440             true) {
   5441           *Var = SrcVar;
   5442           return VarAssign;
   5443         }
   5444       } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
   5445         int32_t MoreOffset = Const->getValue();
   5446         if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
   5447           return nullptr;
   5448         *Var = nullptr;
   5449         *Offset += MoreOffset;
   5450         return VarAssign;
   5451       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
   5452         if (*Relocatable == nullptr) {
   5453           // It is always safe to fold a relocatable through assignment -- the
   5454           // assignment frees a slot in the address operand that can be used to
   5455           // hold the Sandbox Pointer -- if any.
   5456           *Var = nullptr;
   5457           *Relocatable = AddReloc;
   5458           return VarAssign;
   5459         }
   5460       }
   5461     }
   5462   }
   5463   return nullptr;
   5464 }
   5465 
   5466 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
   5467                                                      Variable **Index,
   5468                                                      uint16_t *Shift) {
   5469   // Index==nullptr && Base is Base=Var1+Var2 ==>
   5470   //   set Base=Var1, Index=Var2, Shift=0
   5471   if (*Base == nullptr)
   5472     return nullptr;
   5473   if (*Index != nullptr)
   5474     return nullptr;
   5475   auto *BaseInst = VMetadata->getSingleDefinition(*Base);
   5476   if (BaseInst == nullptr)
   5477     return nullptr;
   5478   assert(!VMetadata->isMultiDef(*Base));
   5479   if (BaseInst->getSrcSize() < 2)
   5480     return nullptr;
   5481   if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
   5482     if (VMetadata->isMultiDef(Var1))
   5483       return nullptr;
   5484     if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
   5485       if (VMetadata->isMultiDef(Var2))
   5486         return nullptr;
   5487       if (isAdd(BaseInst) &&
   5488           // TODO: ensure Var1 and Var2 stay single-BB
   5489           true) {
   5490         *Base = Var1;
   5491         *Index = Var2;
   5492         *Shift = 0; // should already have been 0
   5493         return BaseInst;
   5494       }
   5495     }
   5496   }
   5497   return nullptr;
   5498 }
   5499 
   5500 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
   5501                                                 uint16_t *Shift) {
   5502   // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
   5503   //   Index=Var, Shift+=log2(Const)
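           // For example, if Index is defined by "t = v * 4" and Shift is 0, the
           // match rewrites Index=v, Shift=2, enabling the scaled-index form
           // [Base + v*4 + Offset].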
   5504   if (*Index == nullptr)
   5505     return nullptr;
   5506   auto *IndexInst = VMetadata->getSingleDefinition(*Index);
   5507   if (IndexInst == nullptr)
   5508     return nullptr;
   5509   assert(!VMetadata->isMultiDef(*Index));
   5510 
   5511   // When using an unsigned 32-bit array index on x64, it gets zero-extended
   5512   // before the shift & add. The explicit zero extension can be eliminated
   5513   // because x86 32-bit operations automatically get zero-extended into the
   5514   // corresponding 64-bit register.
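           // For example, given "%idx = zext i32 %i to i64", the matcher looks
           // through the zext and inspects the definition of %i directly: any
           // 32-bit def of %i has already cleared bits 63..32 of its register.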
   5515   if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
   5516     if (CastInst->getCastKind() == InstCast::Zext) {
   5517       if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
   5518         if (Var->getType() == IceType_i32 &&
   5519             CastInst->getDest()->getType() == IceType_i64) {
   5520           IndexInst = VMetadata->getSingleDefinition(Var);
   5521         }
   5522       }
   5523     }
   5524   }
   5525 
   5526   if (IndexInst->getSrcSize() < 2)
   5527     return nullptr;
   5528   if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
   5529     if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
   5530       if (auto *Const =
   5531               llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
   5532         if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
   5533           return nullptr;
   5534         switch (ArithInst->getOp()) {
   5535         default:
   5536           return nullptr;
   5537         case InstArithmetic::Mul: {
   5538           uint32_t Mult = Const->getValue();
   5539           uint32_t LogMult;
   5540           switch (Mult) {
   5541           case 1:
   5542             LogMult = 0;
   5543             break;
   5544           case 2:
   5545             LogMult = 1;
   5546             break;
   5547           case 4:
   5548             LogMult = 2;
   5549             break;
   5550           case 8:
   5551             LogMult = 3;
   5552             break;
   5553           default:
   5554             return nullptr;
   5555           }
   5556           if (*Shift + LogMult <= 3) {
   5557             *Index = Var;
   5558             *Shift += LogMult;
   5559             return IndexInst;
   5560           }
   5561         }
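                 // Note: no break here, so a Mul that fails the shift-budget check
                 // falls through to the Shl case. This is benign: since
                 // Mult > log2(Mult), the Shl budget check necessarily fails as
                 // well, or the amount is rejected by the switch below.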
   5562         case InstArithmetic::Shl: {
   5563           uint32_t ShiftAmount = Const->getValue();
   5564           switch (ShiftAmount) {
   5565           case 0:
   5566           case 1:
   5567           case 2:
   5568           case 3:
   5569             break;
   5570           default:
   5571             return nullptr;
   5572           }
   5573           if (*Shift + ShiftAmount <= 3) {
   5574             *Index = Var;
   5575             *Shift += ShiftAmount;
   5576             return IndexInst;
   5577           }
   5578         }
   5579         }
   5580       }
   5581     }
   5582   }
   5583   return nullptr;
   5584 }
   5585 
   5586 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
   5587     Variable **IndexOrBase, const uint16_t Shift,
   5588     ConstantRelocatable **Relocatable, int32_t *Offset) {
   5589   // Base is Base=Var+Const || Base is Base=Const+Var ==>
   5590   //   set Base=Var, Offset+=Const
   5591   // Base is Base=Var-Const ==>
   5592   //   set Base=Var, Offset-=Const
   5593   // Index is Index=Var+Const ==>
   5594   //   set Index=Var, Offset+=(Const<<Shift)
   5595   // Index is Index=Const+Var ==>
   5596   //   set Index=Var, Offset+=(Const<<Shift)
   5597   // Index is Index=Var-Const ==>
   5598   //   set Index=Var, Offset-=(Const<<Shift)
   5599   // Treat Index=Var Or Const as Index=Var + Const
   5600   //    when Var = Var' << N and log2(Const) <= N
   5601   // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
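           // For example (a sketch): if Var is defined by "v << 2", its two low
           // bits are known zero, so "Var | 3" equals "Var + 3" and the constant
           // can be folded into Offset.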
   5602 
   5603   if (*IndexOrBase == nullptr) {
   5604     return nullptr;
   5605   }
   5606   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
   5607   if (Definition == nullptr) {
   5608     return nullptr;
   5609   }
   5610   assert(!VMetadata->isMultiDef(*IndexOrBase));
   5611   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
   5612     switch (ArithInst->getOp()) {
   5613     case InstArithmetic::Add:
   5614     case InstArithmetic::Sub:
   5615     case InstArithmetic::Or:
   5616       break;
   5617     default:
   5618       return nullptr;
   5619     }
   5620 
   5621     Operand *Src0 = ArithInst->getSrc(0);
   5622     Operand *Src1 = ArithInst->getSrc(1);
   5623     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
   5624     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
   5625     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
   5626     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
   5627     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
   5628     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
   5629 
   5630     bool IsAdd = false;
   5631     if (ArithInst->getOp() == InstArithmetic::Or) {
   5632       Variable *Var = nullptr;
   5633       ConstantInteger32 *Const = nullptr;
   5634       if (Var0 && Const1) {
   5635         Var = Var0;
   5636         Const = Const1;
   5637       } else if (Const0 && Var1) {
   5638         Var = Var1;
   5639         Const = Const0;
   5640       } else {
   5641         return nullptr;
   5642       }
   5643       auto *VarDef =
   5644           llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
   5645       if (VarDef == nullptr)
   5646         return nullptr;
   5647 
   5648       SizeT ZeroesAvailable = 0;
   5649       if (VarDef->getOp() == InstArithmetic::Shl) {
   5650         if (auto *ConstInt =
   5651                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
   5652           ZeroesAvailable = ConstInt->getValue();
   5653         }
   5654       } else if (VarDef->getOp() == InstArithmetic::Mul) {
   5655         SizeT PowerOfTwo = 0;
   5656         if (auto *MultConst =
   5657                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
   5658           if (llvm::isPowerOf2_32(MultConst->getValue())) {
   5659             PowerOfTwo += MultConst->getValue();
   5660           }
   5661         }
   5662         if (auto *MultConst =
   5663                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
   5664           if (llvm::isPowerOf2_32(MultConst->getValue())) {
   5665             PowerOfTwo += MultConst->getValue();
   5666           }
   5667         }
                 // For Var = Var' * (2^N), the low N bits of Var are known zero.
    5668         ZeroesAvailable = llvm::Log2_32(PowerOfTwo);
   5669       }
   5670       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
   5671       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
   5672         return nullptr;
   5673       IsAdd = true; // treat it as an add if the above conditions hold
   5674     } else {
   5675       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
   5676     }
   5677 
   5678     Variable *NewIndexOrBase = nullptr;
   5679     int32_t NewOffset = 0;
   5680     ConstantRelocatable *NewRelocatable = *Relocatable;
   5681     if (Var0 && Var1)
   5682       // TODO(sehr): merge base/index splitting into here.
   5683       return nullptr;
   5684     if (!IsAdd && Var1)
   5685       return nullptr;
   5686     if (Var0)
   5687       NewIndexOrBase = Var0;
   5688     else if (Var1)
   5689       NewIndexOrBase = Var1;
   5690     // Don't know how to add/subtract two relocatables.
   5691     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
   5692       return nullptr;
   5693     // Don't know how to subtract a relocatable.
   5694     if (!IsAdd && Reloc1)
   5695       return nullptr;
   5696     // Incorporate ConstantRelocatables.
   5697     if (Reloc0)
   5698       NewRelocatable = Reloc0;
   5699     else if (Reloc1)
   5700       NewRelocatable = Reloc1;
   5701     // Compute the updated constant offset.
   5702     if (Const0) {
   5703       const int32_t MoreOffset =
   5704           IsAdd ? Const0->getValue() : -Const0->getValue();
   5705       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
   5706         return nullptr;
   5707       NewOffset += MoreOffset;
   5708     }
   5709     if (Const1) {
   5710       const int32_t MoreOffset =
   5711           IsAdd ? Const1->getValue() : -Const1->getValue();
   5712       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
   5713         return nullptr;
   5714       NewOffset += MoreOffset;
   5715     }
   5716     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
   5717       return nullptr;
   5718     *IndexOrBase = NewIndexOrBase;
   5719     *Offset += (NewOffset << Shift);
   5720     // Shift is always zero if this is called with the base
   5721     *Relocatable = NewRelocatable;
   5722     return Definition;
   5723   }
   5724   return nullptr;
   5725 }
   5726 
   5727 template <typename TypeTraits>
   5728 typename TargetX86Base<TypeTraits>::X86OperandMem *
   5729 TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType,
   5730                                              Operand *Addr) {
   5731   Func->resetCurrentNode();
   5732   if (Func->isVerbose(IceV_AddrOpt)) {
   5733     OstreamLocker L(Func->getContext());
   5734     Ostream &Str = Func->getContext()->getStrDump();
   5735     Str << "\nStarting computeAddressOpt for instruction:\n  ";
   5736     Instr->dumpDecorated(Func);
   5737   }
   5738 
   5739   OptAddr NewAddr;
   5740   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
   5741   if (NewAddr.Base == nullptr)
   5742     return nullptr;
   5743 
   5744   // If the Base has more than one use or is live across multiple blocks, then
   5745   // don't go further. Alternatively (?), never consider a transformation that
   5746   // would change a variable that is currently *not* live across basic block
   5747   // boundaries into one that *is*.
   5748   if (!getFlags().getLoopInvariantCodeMotion()) {
    5749     // Multi-block address opt is needed when LICM is enabled.
    5750     // It might make sense to restrict it to the current node and loop header.
   5751     if (Func->getVMetadata()->isMultiBlock(
   5752             NewAddr.Base) /* || Base->getUseCount() > 1*/)
   5753       return nullptr;
   5754   }
   5755   AddressOptimizer AddrOpt(Func);
   5756   const bool MockBounds = getFlags().getMockBoundsCheck();
   5757   const Inst *Reason = nullptr;
   5758   bool AddressWasOptimized = false;
   5759   // The following unnamed struct identifies the address mode formation steps
   5760   // that could potentially create an invalid memory operand (i.e., no free
   5761   // slots for RebasePtr.) We add all those variables to this struct so that we
   5762   // can use memset() to reset all members to false.
   5763   struct {
   5764     bool AssignBase = false;
   5765     bool AssignIndex = false;
   5766     bool OffsetFromBase = false;
   5767     bool OffsetFromIndex = false;
   5768     bool CombinedBaseIndex = false;
   5769   } Skip;
   5770   // This points to the boolean in Skip that represents the last folding
   5771   // performed. This is used to disable a pattern match that generated an
   5772   // invalid address. Without this, the algorithm would never finish.
   5773   bool *SkipLastFolding = nullptr;
   5774   // NewAddrCheckpoint is used to rollback the address being formed in case an
   5775   // invalid address is formed.
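           // For example: if matchCombinedBaseIndex fills both Base and Index but
           // sandboxing later needs a free slot for RebasePtr, NewAddr is rolled
           // back to the checkpoint and Skip.CombinedBaseIndex keeps the same
           // pattern from being re-matched.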
   5776   OptAddr NewAddrCheckpoint;
   5777   Reason = Instr;
   5778   do {
   5779     if (SandboxingType != ST_None) {
   5780       // When sandboxing, we defer the sandboxing of NewAddr to the Concrete
   5781       // Target. If our optimization was overly aggressive, then we simply undo
   5782       // what the previous iteration did, and set the previous pattern's skip
   5783       // bit to true.
   5784       if (!legalizeOptAddrForSandbox(&NewAddr)) {
   5785         *SkipLastFolding = true;
   5786         SkipLastFolding = nullptr;
   5787         NewAddr = NewAddrCheckpoint;
   5788         Reason = nullptr;
   5789       }
   5790     }
   5791 
   5792     if (Reason) {
   5793       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
   5794                              NewAddr.Index, NewAddr.Shift, Reason);
   5795       AddressWasOptimized = true;
   5796       Reason = nullptr;
   5797       SkipLastFolding = nullptr;
   5798       memset(&Skip, 0, sizeof(Skip));
   5799     }
   5800 
   5801     NewAddrCheckpoint = NewAddr;
   5802 
   5803     // Update Base and Index to follow through assignments to definitions.
   5804     if (!Skip.AssignBase &&
   5805         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
   5806                                       &NewAddr.Offset))) {
   5807       SkipLastFolding = &Skip.AssignBase;
   5808       // Assignments of Base from a Relocatable or ConstantInt32 can result
   5809       // in Base becoming nullptr.  To avoid code duplication in this loop we
   5810       // prefer that Base be non-nullptr if possible.
   5811       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
   5812           NewAddr.Shift == 0) {
   5813         std::swap(NewAddr.Base, NewAddr.Index);
   5814       }
   5815       continue;
   5816     }
    5817     if (!Skip.AssignIndex &&
   5818         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
   5819                                       &NewAddr.Offset))) {
   5820       SkipLastFolding = &Skip.AssignIndex;
   5821       continue;
   5822     }
   5823 
   5824     if (!MockBounds) {
   5825       // Transition from:
   5826       //   <Relocatable + Offset>(Base) to
   5827       //   <Relocatable + Offset>(Base, Index)
   5828       if (!Skip.CombinedBaseIndex &&
   5829           (Reason = AddrOpt.matchCombinedBaseIndex(
   5830                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
   5831         SkipLastFolding = &Skip.CombinedBaseIndex;
   5832         continue;
   5833       }
   5834 
   5835       // Recognize multiply/shift and update Shift amount.
   5836       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
   5837       //   Index=Var, Shift+=Const
   5838       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
   5839       //   Index=Var, Shift+=log2(Const)
   5840       if ((Reason =
   5841                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
   5842         continue;
   5843       }
   5844 
   5845       // If Shift is zero, the choice of Base and Index was purely arbitrary.
   5846       // Recognize multiply/shift and set Shift amount.
   5847       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
   5848       //   swap(Index,Base)
   5849       // Similar for Base=Const*Var and Base=Var<<Const
   5850       if (NewAddr.Shift == 0 &&
   5851           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
   5852         std::swap(NewAddr.Base, NewAddr.Index);
   5853         continue;
   5854       }
   5855     }
   5856 
   5857     // Update Offset to reflect additions/subtractions with constants and
   5858     // relocatables.
   5859     // TODO: consider overflow issues with respect to Offset.
   5860     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
   5861                                      &NewAddr.Base, /*Shift =*/0,
   5862                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
   5863       SkipLastFolding = &Skip.OffsetFromBase;
   5864       continue;
   5865     }
   5866     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
   5867                                       &NewAddr.Index, NewAddr.Shift,
   5868                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
   5869       SkipLastFolding = &Skip.OffsetFromIndex;
   5870       continue;
   5871     }
   5872 
   5873     break;
   5874   } while (Reason);
   5875 
   5876   if (!AddressWasOptimized) {
   5877     return nullptr;
   5878   }
   5879 
   5880   // Undo any addition of RebasePtr.  It will be added back when the mem
   5881   // operand is sandboxed.
   5882   if (NewAddr.Base == RebasePtr) {
   5883     NewAddr.Base = nullptr;
   5884   }
   5885 
   5886   if (NewAddr.Index == RebasePtr) {
   5887     NewAddr.Index = nullptr;
   5888     NewAddr.Shift = 0;
   5889   }
   5890 
   5891   Constant *OffsetOp = nullptr;
   5892   if (NewAddr.Relocatable == nullptr) {
   5893     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
   5894   } else {
   5895     OffsetOp =
   5896         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
   5897                             NewAddr.Relocatable->getName());
   5898   }
   5899   // Vanilla ICE load instructions should not use the segment registers, and
   5900   // computeAddressOpt only works at the level of Variables and Constants, not
   5901   // other X86OperandMem, so there should be no mention of segment
   5902   // registers there either.
   5903   static constexpr auto SegmentReg =
   5904       X86OperandMem::SegmentRegisters::DefaultSegment;
   5905 
   5906   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
   5907                                NewAddr.Index, NewAddr.Shift, SegmentReg);
   5908 }
   5909 
   5910 /// Add a mock bounds check on the memory address before using it as a load or
   5911 /// store operand.  The basic idea is that given a memory operand [reg], we
   5912 /// would first add bounds-check code something like:
   5913 ///
   5914 ///   cmp reg, <lb>
   5915 ///   jl out_of_line_error
   5916 ///   cmp reg, <ub>
   5917 ///   jg out_of_line_error
   5918 ///
   5919 /// In reality, the specific code will depend on how <lb> and <ub> are
   5920 /// represented, e.g. an immediate, a global, or a function argument.
   5921 ///
   5922 /// As such, we need to enforce that the memory operand does not have the form
   5923 /// [reg1+reg2], because then there is no simple cmp instruction that would
   5924 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
   5925 /// usually small, and so <ub> could have a safety buffer built in and then we
   5926 /// could instead branch to a custom out_of_line_error that does the precise
   5927 /// check and jumps back if it turns out OK.
   5928 ///
   5929 /// For the purpose of mocking the bounds check, we'll do something like this:
   5930 ///
   5931 ///   cmp reg, 0
   5932 ///   je label
   5933 ///   cmp reg, 1
   5934 ///   je label
   5935 ///   label:
   5936 ///
   5937 /// Also note that we don't need to add a bounds check to a dereference of a
   5938 /// simple global variable address.
   5939 template <typename TraitsType>
   5940 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
   5941   if (!getFlags().getMockBoundsCheck())
   5942     return;
   5943   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
   5944     if (Mem->getIndex()) {
   5945       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
   5946     }
   5947     Opnd = Mem->getBase();
   5948   }
   5949   // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
   5950   // something else.  We only care if it is Variable.
   5951   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
   5952   if (Var == nullptr)
   5953     return;
   5954   // We use lowerStore() to copy out-args onto the stack.  This creates a memory
   5955   // operand with the stack pointer as the base register.  Don't do bounds
   5956   // checks on that.
   5957   if (Var->getRegNum() == getStackReg())
   5958     return;
   5959 
   5960   auto *Label = InstX86Label::create(Func, this);
   5961   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
   5962   _br(Traits::Cond::Br_e, Label);
   5963   _cmp(Opnd, Ctx->getConstantInt32(1));
   5964   _br(Traits::Cond::Br_e, Label);
   5965   Context.insert(Label);
   5966 }
   5967 
   5968 template <typename TraitsType>
   5969 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
   5970   // A Load instruction can be treated the same as an Assign instruction, after
   5971   // the source operand is transformed into an X86OperandMem operand.  Note that
   5972   // the address mode optimization already creates an X86OperandMem operand, so
   5973   // it doesn't need another level of transformation.
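           // e.g. "x = load y" lowers as the assignment "x = [y]", where [y] is
           // the mem operand formed below.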
   5974   Variable *DestLoad = Load->getDest();
   5975   Type Ty = DestLoad->getType();
   5976   Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
   5977   doMockBoundsCheck(Src0);
   5978   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
   5979   lowerAssign(Assign);
   5980 }
   5981 
   5982 template <typename TraitsType>
   5983 void TargetX86Base<TraitsType>::doAddressOptOther() {
   5984   // Inverts some Icmp instructions which helps doAddressOptLoad later.
   5985   // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
   5986   Inst *Instr = iteratorToInst(Context.getCur());
   5987   auto *VMetadata = Func->getVMetadata();
   5988   if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
   5989     if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
   5990         llvm::isa<Constant>(Icmp->getSrc(1)))
   5991       return;
   5992     auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
   5993     if (Var0 == nullptr)
   5994       return;
   5995     if (!VMetadata->isTracked(Var0))
   5996       return;
   5997     auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
   5998     if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
   5999       return;
   6000     if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
   6001       return;
   6002 
   6003     auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
   6004     if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
   6005       auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
   6006       if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
   6007           llvm::isa<InstLoad>(Op1Def)) {
   6008         return; // Both are loads
   6009       }
   6010     }
   6011     Icmp->reverseConditionAndOperands();
   6012   }
   6013 }
   6014 
   6015 template <typename TraitsType>
   6016 void TargetX86Base<TraitsType>::doAddressOptLoad() {
   6017   Inst *Instr = iteratorToInst(Context.getCur());
   6018   Operand *Addr = Instr->getSrc(0);
   6019   Variable *Dest = Instr->getDest();
   6020   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
   6021     Instr->setDeleted();
   6022     Context.insert<InstLoad>(Dest, OptAddr);
   6023   }
   6024 }
   6025 
   6026 template <typename TraitsType>
   6027 void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() {
   6028   auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur());
   6029   Operand *Addr = Intrinsic->getArg(0);
   6030   Variable *Dest = Intrinsic->getDest();
   6031   if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
   6032     Intrinsic->setDeleted();
   6033     const Ice::Intrinsics::IntrinsicInfo Info = {
   6034         Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
   6035         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
   6036     auto Target = Ctx->getConstantUndef(Ice::IceType_i32);
   6037     auto *NewLoad = Context.insert<InstIntrinsicCall>(2, Dest, Target, Info);
   6038     NewLoad->addArg(OptAddr);
   6039     NewLoad->addArg(Intrinsic->getArg(1));
   6040   }
   6041 }
   6042 
   6043 template <typename TraitsType>
   6044 void TargetX86Base<TraitsType>::randomlyInsertNop(float Probability,
   6045                                                   RandomNumberGenerator &RNG) {
   6046   RandomNumberGeneratorWrapper RNGW(RNG);
   6047   if (RNGW.getTrueWithProbability(Probability)) {
   6048     _nop(RNGW(Traits::X86_NUM_NOP_VARIANTS));
   6049   }
   6050 }
   6051 
   6052 template <typename TraitsType>
   6053 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
   6054   Func->setError("Phi found in regular instruction list");
   6055 }
   6056 
   6057 template <typename TraitsType>
   6058 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
   6059   Variable *Reg = nullptr;
   6060   if (Instr->hasRetValue()) {
   6061     Operand *RetValue = legalize(Instr->getRetValue());
   6062     const Type ReturnType = RetValue->getType();
   6063     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
   6064            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
   6065     Reg = moveReturnValueToRegister(RetValue, ReturnType);
   6066   }
   6067   // Add a ret instruction even if sandboxing is enabled, because addEpilog
   6068   // explicitly looks for a ret instruction as a marker for where to insert the
   6069   // frame removal instructions.
   6070   _ret(Reg);
   6071   // Add a fake use of esp to make sure esp stays alive for the entire
   6072   // function. Otherwise post-call esp adjustments get dead-code eliminated.
   6073   keepEspLiveAtExit();
   6074 }
   6075 
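         // For example, makePshufdMask(0, 1, 2, 3) returns 0xE4 (0b11100100), the
         // identity shuffle: element i of the result selects element i of the
         // source.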
   6076 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
   6077                                SizeT Index3) {
   6078   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
   6079                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
   6080   assert(Mask < 256);
   6081   return Mask;
   6082 }
   6083 
   6084 template <typename TraitsType>
   6085 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
   6086     Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
   6087   constexpr SizeT SrcBit = 1 << 2;
   6088   assert((Index0 & SrcBit) == (Index1 & SrcBit));
   6089   assert((Index0 & SrcBit) == (Index2 & SrcBit));
   6090   assert((Index0 & SrcBit) == (Index3 & SrcBit));
   6091   (void)SrcBit;
   6092 
   6093   const Type SrcTy = Src->getType();
   6094   auto *T = makeReg(SrcTy);
   6095   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
   6096   auto *Mask =
   6097       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
   6098   _pshufd(T, SrcRM, Mask);
   6099   return T;
   6100 }
   6101 
   6102 template <typename TraitsType>
   6103 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
   6104     Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2,
   6105     SizeT Index3) {
   6106   constexpr SizeT SrcBit = 1 << 2;
   6107   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
   6108   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
   6109   (void)SrcBit;
   6110 
   6111   const Type SrcTy = Src0->getType();
   6112   assert(Src1->getType() == SrcTy);
   6113   auto *T = makeReg(SrcTy);
   6114   auto *Src0R = legalizeToReg(Src0);
   6115   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6116   auto *Mask =
   6117       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
   6118   _movp(T, Src0R);
   6119   _shufps(T, Src1RM, Mask);
   6120   return T;
   6121 }
   6122 
   6123 template <typename TraitsType>
   6124 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
   6125     Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) {
   6126   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
   6127                                            Index1, IGNORE_INDEX);
   6128 }
   6129 
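         // Builds a 4-bit mask recording, per element, whether the shuffle index
         // selects from Src0 (bit clear) or Src1 (bit set). For example, indexes
         // (0, 1, 4, 5) on 4-element vectors yield 0b1100: elements 2 and 3 come
         // from Src1.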
   6130 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
   6131                                SizeT Index3) {
   6132   constexpr SizeT SrcBit = 1 << 2;
   6133   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
   6134   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
   6135   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
   6136   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
   6137   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
   6138 }
   6139 
   6140 template <typename TraitsType>
   6141 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
   6142   GlobalString FuncName = Func->getFunctionName();
   6143   const SizeT Id = PshufbMaskCount++;
   6144   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
   6145     return GlobalString::createWithString(
   6146         Ctx,
   6147         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
   6148   }
   6149   return GlobalString::createWithString(
   6150       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
   6151 }
   6152 
   6153 template <typename TraitsType>
   6154 ConstantRelocatable *
   6155 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
   6156     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
   6157     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
   6158     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
   6159     int8_t Idx15) {
   6160   static constexpr uint8_t NumElements = 16;
   6161   const char Initializer[NumElements] = {
   6162       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
   6163       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
   6164   };
   6165 
   6166   static constexpr Type V4VectorType = IceType_v4i32;
   6167   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
   6168   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
   6169   GlobalString MaskName = lowerShuffleVector_NewMaskName();
   6170   Mask->setIsConstant(true);
   6171   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
   6172       Func->getGlobalPool(), Initializer, NumElements));
   6173   Mask->setName(MaskName);
   6174   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
   6175   Mask->setAlignment(MaskAlignment);
   6176   Func->addGlobal(Mask);
   6177 
   6178   constexpr RelocOffsetT Offset = 0;
   6179   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
   6180 }
   6181 
   6182 template <typename TraitsType>
   6183 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
   6184     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
   6185     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
   6186     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
   6187     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
   6188   const Type DestTy = Dest->getType();
   6189   static constexpr bool NotRebased = false;
   6190   static constexpr Variable *NoBase = nullptr;
   6191   // We use void for the memory operand instead of DestTy because using the
   6192   // latter causes a validation failure: the X86 Inst layer complains that
    6193   // vector mem operands could be under-aligned. Using void avoids the
    6194   // validation error. Note that the mask global declaration is aligned, so it
   6195   // can be used as an XMM mem operand.
   6196   static constexpr Type MaskType = IceType_void;
   6197 #define IDX_IN_SRC(N, S)                                                       \
   6198   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
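           // e.g. with S == 1, Idx == 17 (element 1 of Src1) keeps 0x1, while the
           // same Idx with S == 0 maps to CLEAR_ALL_BITS, whose set high bit makes
           // pshufb zero that destination byte.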
   6199   auto *Mask0M = X86OperandMem::create(
   6200       Func, MaskType, NoBase,
   6201       lowerShuffleVector_CreatePshufbMask(
   6202           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
   6203           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
   6204           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
   6205           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
   6206           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
   6207           IDX_IN_SRC(Idx15, 0)),
   6208       NotRebased);
   6209 
   6210   auto *T0 = makeReg(DestTy);
   6211   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6212   _movp(T0, Src0RM);
   6213 
   6214   _pshufb(T0, Mask0M);
   6215 
   6216   if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
   6217       Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
   6218       Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
   6219       Idx15 >= 16) {
   6220     auto *Mask1M = X86OperandMem::create(
   6221         Func, MaskType, NoBase,
   6222         lowerShuffleVector_CreatePshufbMask(
   6223             IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
   6224             IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
   6225             IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
   6226             IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
   6227             IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
   6228             IDX_IN_SRC(Idx15, 1)),
   6229         NotRebased);
   6230 #undef IDX_IN_SRC
   6231     auto *T1 = makeReg(DestTy);
   6232     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6233     _movp(T1, Src1RM);
   6234     _pshufb(T1, Mask1M);
   6235     _por(T0, T1);
   6236   }
   6237 
   6238   _movp(Dest, T0);
   6239 }
   6240 
   6241 template <typename TraitsType>
   6242 void TargetX86Base<TraitsType>::lowerShuffleVector(
   6243     const InstShuffleVector *Instr) {
   6244   auto *Dest = Instr->getDest();
   6245   const Type DestTy = Dest->getType();
   6246   auto *Src0 = Instr->getSrc(0);
   6247   auto *Src1 = Instr->getSrc(1);
   6248   const SizeT NumElements = typeNumElements(DestTy);
   6249 
   6250   auto *T = makeReg(DestTy);
   6251 
   6252   switch (DestTy) {
   6253   default:
   6254     llvm::report_fatal_error("Unexpected vector type.");
   6255   case IceType_v16i1:
   6256   case IceType_v16i8: {
   6257     static constexpr SizeT ExpectedNumElements = 16;
   6258     assert(ExpectedNumElements == Instr->getNumIndexes());
   6259     (void)ExpectedNumElements;
   6260 
   6261     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
   6262       auto *T = makeReg(DestTy);
   6263       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6264       _movp(T, Src0RM);
   6265       _punpckl(T, Src0RM);
   6266       _movp(Dest, T);
   6267       return;
   6268     }
   6269 
   6270     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
   6271                           23)) {
   6272       auto *T = makeReg(DestTy);
   6273       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6274       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6275       _movp(T, Src0RM);
   6276       _punpckl(T, Src1RM);
   6277       _movp(Dest, T);
   6278       return;
   6279     }
   6280 
   6281     if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
   6282                           15, 15)) {
   6283       auto *T = makeReg(DestTy);
   6284       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6285       _movp(T, Src0RM);
   6286       _punpckh(T, Src0RM);
   6287       _movp(Dest, T);
   6288       return;
   6289     }
   6290 
   6291     if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
   6292                           15, 31)) {
   6293       auto *T = makeReg(DestTy);
   6294       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6295       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6296       _movp(T, Src0RM);
   6297       _punpckh(T, Src1RM);
   6298       _movp(Dest, T);
   6299       return;
   6300     }
   6301 
   6302     if (InstructionSet < Traits::SSE4_1) {
   6303       // TODO(jpp): figure out how to lower with sse2.
   6304       break;
   6305     }
   6306 
   6307     const SizeT Index0 = Instr->getIndexValue(0);
   6308     const SizeT Index1 = Instr->getIndexValue(1);
   6309     const SizeT Index2 = Instr->getIndexValue(2);
   6310     const SizeT Index3 = Instr->getIndexValue(3);
   6311     const SizeT Index4 = Instr->getIndexValue(4);
   6312     const SizeT Index5 = Instr->getIndexValue(5);
   6313     const SizeT Index6 = Instr->getIndexValue(6);
   6314     const SizeT Index7 = Instr->getIndexValue(7);
   6315     const SizeT Index8 = Instr->getIndexValue(8);
   6316     const SizeT Index9 = Instr->getIndexValue(9);
   6317     const SizeT Index10 = Instr->getIndexValue(10);
   6318     const SizeT Index11 = Instr->getIndexValue(11);
   6319     const SizeT Index12 = Instr->getIndexValue(12);
   6320     const SizeT Index13 = Instr->getIndexValue(13);
   6321     const SizeT Index14 = Instr->getIndexValue(14);
   6322     const SizeT Index15 = Instr->getIndexValue(15);
   6323 
   6324     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
   6325                                    Index3, Index4, Index5, Index6, Index7,
   6326                                    Index8, Index9, Index10, Index11, Index12,
   6327                                    Index13, Index14, Index15);
   6328     return;
   6329   }
   6330   case IceType_v8i1:
   6331   case IceType_v8i16: {
   6332     static constexpr SizeT ExpectedNumElements = 8;
   6333     assert(ExpectedNumElements == Instr->getNumIndexes());
   6334     (void)ExpectedNumElements;
   6335 
   6336     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
   6337       auto *T = makeReg(DestTy);
   6338       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6339       _movp(T, Src0RM);
   6340       _punpckl(T, Src0RM);
   6341       _movp(Dest, T);
   6342       return;
   6343     }
   6344 
   6345     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
   6346       auto *T = makeReg(DestTy);
   6347       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6348       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6349       _movp(T, Src0RM);
   6350       _punpckl(T, Src1RM);
   6351       _movp(Dest, T);
   6352       return;
   6353     }
   6354 
   6355     if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
   6356       auto *T = makeReg(DestTy);
   6357       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6358       _movp(T, Src0RM);
   6359       _punpckh(T, Src0RM);
   6360       _movp(Dest, T);
   6361       return;
   6362     }
   6363 
   6364     if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
   6365       auto *T = makeReg(DestTy);
   6366       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6367       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6368       _movp(T, Src0RM);
   6369       _punpckh(T, Src1RM);
   6370       _movp(Dest, T);
   6371       return;
   6372     }
   6373 
   6374     if (InstructionSet < Traits::SSE4_1) {
   6375       // TODO(jpp): figure out how to lower with sse2.
   6376       break;
   6377     }
   6378 
   6379     const SizeT Index0 = Instr->getIndexValue(0);
   6380     const SizeT Index1 = Instr->getIndexValue(1);
   6381     const SizeT Index2 = Instr->getIndexValue(2);
   6382     const SizeT Index3 = Instr->getIndexValue(3);
   6383     const SizeT Index4 = Instr->getIndexValue(4);
   6384     const SizeT Index5 = Instr->getIndexValue(5);
   6385     const SizeT Index6 = Instr->getIndexValue(6);
   6386     const SizeT Index7 = Instr->getIndexValue(7);
   6387 
   6388 #define TO_BYTE_INDEX(I) ((I) << 1)
   6389     lowerShuffleVector_UsingPshufb(
   6390         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
   6391         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
   6392         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
   6393         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
   6394         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
   6395         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
   6396         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
   6397         TO_BYTE_INDEX(Index7) + 1);
   6398 #undef TO_BYTE_INDEX
   6399     return;
   6400   }
   6401   case IceType_v4i1:
   6402   case IceType_v4i32:
   6403   case IceType_v4f32: {
   6404     static constexpr SizeT ExpectedNumElements = 4;
   6405     assert(ExpectedNumElements == Instr->getNumIndexes());
   6406     const SizeT Index0 = Instr->getIndexValue(0);
   6407     const SizeT Index1 = Instr->getIndexValue(1);
   6408     const SizeT Index2 = Instr->getIndexValue(2);
   6409     const SizeT Index3 = Instr->getIndexValue(3);
   6410     Variable *T = nullptr;
   6411     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
   6412 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
   6413   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
   6414       CASE_SRCS_IN(0, 0, 0, 0) : {
   6415         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
   6416                                               Index3);
   6417       }
   6418       break;
   6419       CASE_SRCS_IN(0, 0, 0, 1) : {
   6420         assert(false && "Following code is untested but likely correct; test "
   6421                         "and remove assert.");
   6422         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
   6423                                                                   Src1, Index3);
   6424         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
   6425                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6426       }
   6427       break;
   6428       CASE_SRCS_IN(0, 0, 1, 0) : {
   6429         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
   6430                                                                   Src0, Index3);
   6431         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
   6432                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6433       }
   6434       break;
   6435       CASE_SRCS_IN(0, 0, 1, 1) : {
   6436         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
   6437                                               Index2, Index3);
   6438       }
   6439       break;
   6440       CASE_SRCS_IN(0, 1, 0, 0) : {
   6441         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
   6442                                                                   Src1, Index1);
   6443         T = lowerShuffleVector_TwoFromSameSrc(
   6444             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
   6445       }
   6446       break;
   6447       CASE_SRCS_IN(0, 1, 0, 1) : {
   6448         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
   6449             (Index3 - ExpectedNumElements) == 1) {
   6450           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
   6451           auto *Src0R = legalizeToReg(Src0);
   6452           T = makeReg(DestTy);
   6453           _movp(T, Src0R);
   6454           _punpckl(T, Src1RM);
   6455         } else if (Index0 == Index2 && Index1 == Index3) {
   6456           assert(false && "Following code is untested but likely correct; test "
   6457                           "and remove assert.");
   6458           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
   6459               Src0, Index0, Src1, Index1);
   6460           T = lowerShuffleVector_AllFromSameSrc(
   6461               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
   6462               UNIFIED_INDEX_1);
   6463         } else {
   6464           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6465               Src0, Index0, Src1, Index1);
   6466           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6467               Src0, Index2, Src1, Index3);
   6468           T = lowerShuffleVector_TwoFromSameSrc(
   6469               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
   6470               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6471         }
   6472       }
   6473       break;
   6474       CASE_SRCS_IN(0, 1, 1, 0) : {
   6475         if (Index0 == Index3 && Index1 == Index2) {
   6476           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
   6477               Src0, Index0, Src1, Index1);
   6478           T = lowerShuffleVector_AllFromSameSrc(
   6479               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
   6480               UNIFIED_INDEX_0);
   6481         } else {
   6482           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6483               Src0, Index0, Src1, Index1);
   6484           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6485               Src1, Index2, Src0, Index3);
   6486           T = lowerShuffleVector_TwoFromSameSrc(
   6487               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
   6488               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6489         }
   6490       }
   6491       break;
   6492       CASE_SRCS_IN(0, 1, 1, 1) : {
   6493         assert(false && "Following code is untested but likely correct; test "
   6494                         "and remove assert.");
   6495         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
   6496                                                                   Src1, Index1);
   6497         T = lowerShuffleVector_TwoFromSameSrc(
   6498             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
   6499       }
   6500       break;
   6501       CASE_SRCS_IN(1, 0, 0, 0) : {
   6502         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
   6503                                                                   Src0, Index1);
   6504         T = lowerShuffleVector_TwoFromSameSrc(
   6505             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
   6506       }
   6507       break;
   6508       CASE_SRCS_IN(1, 0, 0, 1) : {
   6509         if (Index0 == Index3 && Index1 == Index2) {
   6510           assert(false && "Following code is untested but likely correct; test "
   6511                           "and remove assert.");
   6512           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
   6513               Src1, Index0, Src0, Index1);
   6514           T = lowerShuffleVector_AllFromSameSrc(
   6515               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
   6516               UNIFIED_INDEX_0);
   6517         } else {
   6518           assert(false && "Following code is untested but likely correct; test "
   6519                           "and remove assert.");
   6520           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6521               Src1, Index0, Src0, Index1);
   6522           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6523               Src0, Index2, Src1, Index3);
   6524           T = lowerShuffleVector_TwoFromSameSrc(
   6525               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
   6526               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6527         }
   6528       }
   6529       break;
   6530       CASE_SRCS_IN(1, 0, 1, 0) : {
   6531         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
   6532             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
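                   // Note the deliberate swap: indexes <N, 0, N+1, 1> put Src1 in
                   // the even lanes, so Src1 plays the punpckl destination role.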
   6533           auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
   6534           auto *Src0R = legalizeToReg(Src1);
   6535           T = makeReg(DestTy);
   6536           _movp(T, Src0R);
   6537           _punpckl(T, Src1RM);
   6538         } else if (Index0 == Index2 && Index1 == Index3) {
   6539           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
   6540               Src1, Index0, Src0, Index1);
   6541           T = lowerShuffleVector_AllFromSameSrc(
   6542               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
   6543               UNIFIED_INDEX_1);
   6544         } else {
   6545           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6546               Src1, Index0, Src0, Index1);
   6547           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
   6548               Src1, Index2, Src0, Index3);
   6549           T = lowerShuffleVector_TwoFromSameSrc(
   6550               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
   6551               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6552         }
   6553       }
   6554       break;
   6555       CASE_SRCS_IN(1, 0, 1, 1) : {
   6556         assert(false && "Following code is untested but likely correct; test "
   6557                         "and remove assert.");
   6558         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
   6559                                                                   Src0, Index1);
   6560         T = lowerShuffleVector_TwoFromSameSrc(
   6561             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
   6562       }
   6563       break;
   6564       CASE_SRCS_IN(1, 1, 0, 0) : {
   6565         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
   6566                                               Index2, Index3);
   6567       }
   6568       break;
   6569       CASE_SRCS_IN(1, 1, 0, 1) : {
   6570         assert(false && "Following code is untested but likely correct; test "
   6571                         "and remove assert.");
   6572         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
   6573                                                                   Src1, Index3);
   6574         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
   6575                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6576       }
   6577       break;
   6578       CASE_SRCS_IN(1, 1, 1, 0) : {
   6579         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
   6580                                                                   Src0, Index3);
   6581         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
   6582                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
   6583       }
   6584       break;
   6585       CASE_SRCS_IN(1, 1, 1, 1) : {
   6586         assert(false && "Following code is untested but likely correct; test "
   6587                         "and remove assert.");
   6588         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
   6589                                               Index3);
   6590       }
   6591       break;
   6592 #undef CASE_SRCS_IN
   6593     }
   6594 
   6595     assert(T != nullptr);
   6596     assert(T->getType() == DestTy);
   6597     _movp(Dest, T);
   6598     return;
   6599   } break;
   6600   }
   6601 
   6602   // Unoptimized shuffle. Perform a series of inserts and extracts.
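           // Output lane I takes element Index[I]: indexes below NumElements
           // select from Src0, and indexes of NumElements or more select from
           // Src1 (hence the Elem - NumElements adjustment below).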
   6603   Context.insert<InstFakeDef>(T);
   6604   const Type ElementType = typeElementType(DestTy);
   6605   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
   6606     auto *Index = Instr->getIndex(I);
   6607     const SizeT Elem = Index->getValue();
   6608     auto *ExtElmt = makeReg(ElementType);
   6609     if (Elem < NumElements) {
   6610       lowerExtractElement(
   6611           InstExtractElement::create(Func, ExtElmt, Src0, Index));
   6612     } else {
   6613       lowerExtractElement(InstExtractElement::create(
   6614           Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
   6615     }
   6616     auto *NewT = makeReg(DestTy);
   6617     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
   6618                                                  Ctx->getConstantInt32(I)));
   6619     T = NewT;
   6620   }
   6621   _movp(Dest, T);
   6622 }
   6623 
   6624 template <typename TraitsType>
   6625 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
   6626   Variable *Dest = Select->getDest();
   6627 
   6628   Operand *Condition = Select->getCondition();
   6629   // Handle folding opportunities.
   6630   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
   6631     assert(Producer->isDeleted());
   6632     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
   6633     default:
   6634       break;
   6635     case BoolFolding<Traits>::PK_Icmp32:
   6636     case BoolFolding<Traits>::PK_Icmp64: {
   6637       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
   6638       return;
   6639     }
   6640     case BoolFolding<Traits>::PK_Fcmp: {
   6641       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
   6642       return;
   6643     }
   6644     }
   6645   }
   6646 
   6647   if (isVectorType(Dest->getType())) {
   6648     lowerSelectVector(Select);
   6649     return;
   6650   }
   6651 
   6652   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
   6653   Operand *Zero = Ctx->getConstantZero(IceType_i32);
   6654   _cmp(CmpResult, Zero);
   6655   Operand *SrcT = Select->getTrueOperand();
   6656   Operand *SrcF = Select->getFalseOperand();
   6657   const BrCond Cond = Traits::Cond::Br_ne;
   6658   lowerSelectMove(Dest, Cond, SrcT, SrcF);
   6659 }
   6660 
   6661 template <typename TraitsType>
   6662 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
   6663                                                 Operand *SrcT, Operand *SrcF) {
   6664   Type DestTy = Dest->getType();
   6665   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
   6666     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
   6667     // explicit control flow.
   6668     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
   6669     auto *Label = InstX86Label::create(Func, this);
   6670     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
   6671     _mov(Dest, SrcT);
   6672     _br(Cond, Label);
   6673     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
   6674     _redefined(_mov(Dest, SrcF));
   6675     Context.insert(Label);
   6676     return;
   6677   }
   6678   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
   6679   // But if SrcT is immediate, we might be able to do better, as the cmov
   6680   // instruction doesn't allow an immediate operand:
   6681   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
   6682   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
   6683     std::swap(SrcT, SrcF);
   6684     Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
   6685   }
   6686   if (!Traits::Is64Bit && DestTy == IceType_i64) {
   6687     SrcT = legalizeUndef(SrcT);
   6688     SrcF = legalizeUndef(SrcF);
   6689     // Set the low portion.
   6690     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   6691     lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
   6692     // Set the high portion.
   6693     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   6694     lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
   6695     return;
   6696   }
   6697 
   6698   assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
   6699          (Traits::Is64Bit && DestTy == IceType_i64));
   6700   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
   6701 }
   6702 
   6703 template <typename TraitsType>
   6704 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
   6705                                                    Operand *SrcT,
   6706                                                    Operand *SrcF) {
   6707   Variable *T = nullptr;
   6708   SrcF = legalize(SrcF);
   6709   _mov(T, SrcF);
   6710   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
   6711   _cmov(T, SrcT, Cond);
   6712   _mov(Dest, T);
   6713 }
   6714 
   6715 template <typename TraitsType>
   6716 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
   6717                                           bool IsRedefinition) {
   6718   assert(Dest->getType() == Src->getType());
   6719   assert(!Dest->isRematerializable());
   6720   if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
   6721     Src = legalize(Src);
   6722     Operand *SrcLo = loOperand(Src);
   6723     Operand *SrcHi = hiOperand(Src);
   6724     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   6725     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   6726     Variable *T_Lo = nullptr, *T_Hi = nullptr;
   6727     _mov(T_Lo, SrcLo);
   6728     _redefined(_mov(DestLo, T_Lo), IsRedefinition);
   6729     _mov(T_Hi, SrcHi);
   6730     _redefined(_mov(DestHi, T_Hi), IsRedefinition);
   6731   } else {
   6732     Operand *SrcLegal;
   6733     if (Dest->hasReg()) {
   6734       // If Dest already has a physical register, then only basic legalization
   6735       // is needed, as the source operand can be a register, immediate, or
   6736       // memory.
   6737       SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
   6738     } else {
    6739       // If Dest could be a stack operand, then the legalized source must be
    6740       // a physical register or a scalar integer immediate.
   6741       SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
   6742     }
   6743     if (isVectorType(Dest->getType())) {
   6744       _redefined(_movp(Dest, SrcLegal), IsRedefinition);
   6745     } else {
   6746       _redefined(_mov(Dest, SrcLegal), IsRedefinition);
   6747     }
   6748   }
   6749 }
   6750 
   6751 template <typename TraitsType>
   6752 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
   6753     const InstFcmp *Fcmp, const InstSelect *Select) {
   6754   Operand *CmpSrc0 = Fcmp->getSrc(0);
   6755   Operand *CmpSrc1 = Fcmp->getSrc(1);
   6756   Operand *SelectSrcT = Select->getTrueOperand();
   6757   Operand *SelectSrcF = Select->getFalseOperand();
   6758   Variable *SelectDest = Select->getDest();
   6759 
   6760   // TODO(capn): also handle swapped compare/select operand order.
   6761   if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
   6762     return false;
   6763 
   6764   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
   6765   InstFcmp::FCond Condition = Fcmp->getCondition();
   6766   switch (Condition) {
   6767   default:
   6768     return false;
   6769   case InstFcmp::True:
   6770     break;
   6771   case InstFcmp::False:
   6772     break;
   6773   case InstFcmp::Ogt: {
   6774     Variable *T = makeReg(SelectDest->getType());
   6775     if (isScalarFloatingType(SelectSrcT->getType())) {
   6776       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
   6777       _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
   6778       _mov(SelectDest, T);
   6779     } else {
   6780       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
   6781       _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
   6782       _movp(SelectDest, T);
   6783     }
   6784     return true;
   6785   } break;
   6786   case InstFcmp::Olt: {
   6787     Variable *T = makeReg(SelectSrcT->getType());
   6788     if (isScalarFloatingType(SelectSrcT->getType())) {
   6789       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
   6790       _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
   6791       _mov(SelectDest, T);
   6792     } else {
   6793       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
   6794       _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
   6795       _movp(SelectDest, T);
   6796     }
   6797     return true;
   6798   } break;
   6799   }
   6800   return false;
   6801 }
   6802 
   6803 template <typename TraitsType>
   6804 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
   6805   Variable *Dest = Icmp->getDest();
   6806   if (isVectorType(Dest->getType())) {
   6807     lowerIcmpVector(Icmp);
   6808   } else {
   6809     constexpr Inst *Consumer = nullptr;
   6810     lowerIcmpAndConsumer(Icmp, Consumer);
   6811   }
   6812 }
   6813 
   6814 template <typename TraitsType>
   6815 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
   6816   Variable *Dest = Instr->getDest();
   6817   Type DestTy = Dest->getType();
   6818   Operand *SrcT = Instr->getTrueOperand();
   6819   Operand *SrcF = Instr->getFalseOperand();
   6820   Operand *Condition = Instr->getCondition();
   6821 
   6822   if (!isVectorType(DestTy))
   6823     llvm::report_fatal_error("Expected a vector select");
   6824 
   6825   Type SrcTy = SrcT->getType();
   6826   Variable *T = makeReg(SrcTy);
   6827   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
   6828   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
   6829 
   6830   if (InstructionSet >= Traits::SSE4_1) {
   6831     // TODO(wala): If the condition operand is a constant, use blendps or
   6832     // pblendw.
   6833     //
   6834     // Use blendvps or pblendvb to implement select.
   6835     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
   6836         SrcTy == IceType_v4f32) {
   6837       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
   6838       Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
   6839       _movp(xmm0, ConditionRM);
   6840       _psll(xmm0, Ctx->getConstantInt8(31));
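               // blendvps selects each lane based on the sign bit of the
               // corresponding lane of xmm0, so shift the condition's bit 0 up to
               // bit 31 of each 32-bit lane.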
   6841       _movp(T, SrcFRM);
   6842       _blendvps(T, SrcTRM, xmm0);
   6843       _movp(Dest, T);
   6844     } else {
   6845       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
   6846       Type SignExtTy =
   6847           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
   6848       Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
   6849       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
   6850       _movp(T, SrcFRM);
   6851       _pblendvb(T, SrcTRM, xmm0);
   6852       _movp(Dest, T);
   6853     }
   6854     return;
   6855   }
   6856   // Lower select without Traits::SSE4.1:
   6857   // a=d?b:c ==>
   6858   //   if elementtype(d) != i1:
   6859   //      d=sext(d);
   6860   //   a=(b&d)|(c&~d);
   6861   Variable *T2 = makeReg(SrcTy);
   6862   // Sign extend the condition operand if applicable.
   6863   if (SrcTy == IceType_v4f32) {
   6864     // The sext operation takes only integer arguments.
   6865     Variable *T3 = Func->makeVariable(IceType_v4i32);
   6866     lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
   6867     _movp(T, T3);
   6868   } else if (typeElementType(SrcTy) != IceType_i1) {
   6869     lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
   6870   } else {
   6871     Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
   6872     _movp(T, ConditionRM);
   6873   }
   6874   _movp(T2, T);
   6875   _pand(T, SrcTRM);
   6876   _pandn(T2, SrcFRM);
   6877   _por(T, T2);
   6878   _movp(Dest, T);
   6879 
   6880   return;
   6881 }
   6882 
   6883 template <typename TraitsType>
   6884 void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
   6885   Operand *Value = Instr->getData();
   6886   Operand *Addr = Instr->getAddr();
   6887   X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
   6888   doMockBoundsCheck(NewAddr);
   6889   Type Ty = NewAddr->getType();
   6890 
   6891   if (!Traits::Is64Bit && Ty == IceType_i64) {
   6892     Value = legalizeUndef(Value);
   6893     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
   6894     _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
   6895     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
   6896     _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
   6897   } else if (isVectorType(Ty)) {
   6898     _storep(legalizeToReg(Value), NewAddr);
   6899   } else {
   6900     Value = legalize(Value, Legal_Reg | Legal_Imm);
   6901     _store(Value, NewAddr);
   6902   }
   6903 }
   6904 
   6905 template <typename TraitsType>
   6906 void TargetX86Base<TraitsType>::doAddressOptStore() {
   6907   auto *Instr = llvm::cast<InstStore>(Context.getCur());
   6908   Operand *Addr = Instr->getAddr();
   6909   Operand *Data = Instr->getData();
   6910   if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
   6911     Instr->setDeleted();
   6912     auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
   6913     if (Instr->getDest())
   6914       NewStore->setRmwBeacon(Instr->getRmwBeacon());
   6915   }
   6916 }
   6917 
   6918 template <typename TraitsType>
   6919 void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() {
   6920   auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur());
   6921   Operand *Addr = Intrinsic->getArg(1);
   6922   Operand *Data = Intrinsic->getArg(0);
   6923   if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
   6924     Intrinsic->setDeleted();
   6925     const Ice::Intrinsics::IntrinsicInfo Info = {
   6926         Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
   6927         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
   6928     auto Target = Ctx->getConstantUndef(Ice::IceType_i32);
   6929     auto *NewStore =
   6930         Context.insert<InstIntrinsicCall>(3, nullptr, Target, Info);
   6931     NewStore->addArg(Data);
   6932     NewStore->addArg(OptAddr);
   6933     NewStore->addArg(Intrinsic->getArg(2));
   6934   }
   6935 }
   6936 
   6937 template <typename TraitsType>
   6938 Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
   6939                                                   uint64_t Min, uint64_t Max) {
   6940   // TODO(ascull): 64-bit should not reach here but only because it is not
   6941   // implemented yet. This should be able to handle the 64-bit case.
   6942   assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
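           // Range-check trick: X is in [Min, Max] iff, compared as unsigned,
           // (X - Min) <= (Max - Min); callers then branch above (out of range)
           // or below-or-equal (in range).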
   6943   // Subtracting 0 is a nop so don't do it
   6944   if (Min != 0) {
   6945     // Avoid clobbering the comparison by copying it
   6946     Variable *T = nullptr;
   6947     _mov(T, Comparison);
   6948     _sub(T, Ctx->getConstantInt32(Min));
   6949     Comparison = T;
   6950   }
   6951 
   6952   _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
   6953 
   6954   return Comparison;
   6955 }
   6956 
   6957 template <typename TraitsType>
   6958 void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
   6959                                                  Operand *Comparison,
   6960                                                  bool DoneCmp,
   6961                                                  CfgNode *DefaultTarget) {
   6962   switch (Case.getKind()) {
   6963   case CaseCluster::JumpTable: {
   6964     InstX86Label *SkipJumpTable;
   6965 
   6966     Operand *RangeIndex =
   6967         lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
   6968     if (DefaultTarget == nullptr) {
   6969       // Skip over jump table logic if comparison not in range and no default
   6970       SkipJumpTable = InstX86Label::create(Func, this);
   6971       _br(Traits::Cond::Br_a, SkipJumpTable);
   6972     } else {
   6973       _br(Traits::Cond::Br_a, DefaultTarget);
   6974     }
   6975 
   6976     InstJumpTable *JumpTable = Case.getJumpTable();
   6977     Context.insert(JumpTable);
   6978 
   6979     // Make sure the index is a register of the same width as the base
   6980     Variable *Index;
   6981     const Type PointerType = getPointerType();
   6982     if (RangeIndex->getType() != PointerType) {
   6983       Index = makeReg(PointerType);
   6984       if (RangeIndex->getType() == IceType_i64) {
   6985         assert(Traits::Is64Bit);
   6986         _mov(Index, RangeIndex); // trunc
   6987       } else {
   6988         Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
   6989         _movzx(Index, RangeIndexRM);
   6990       }
   6991     } else {
   6992       Index = legalizeToReg(RangeIndex);
   6993     }
   6994 
   6995     constexpr RelocOffsetT RelocOffset = 0;
   6996     constexpr Variable *NoBase = nullptr;
   6997     constexpr Constant *NoOffset = nullptr;
   6998     auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
   6999     Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
   7000     uint16_t Shift = typeWidthInBytesLog2(PointerType);
   7001     constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
   7002 
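             // The branch target is loaded from the jump table entry at
             // JTBase + (Index << Shift), i.e. JumpTable[Index].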
   7003     Variable *Target = nullptr;
   7004     if (Traits::Is64Bit && NeedSandboxing) {
   7005       assert(Index != nullptr && Index->getType() == IceType_i32);
   7006     }
   7007 
   7008     if (PointerType == IceType_i32) {
   7009       _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
   7010                                          Index, Shift, Segment));
   7011     } else {
   7012       auto *Base = makeReg(IceType_i64);
   7013       _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
   7014       _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
   7015                                          Index, Shift, Segment));
   7016     }
   7017 
   7018     lowerIndirectJump(Target);
   7019 
   7020     if (DefaultTarget == nullptr)
   7021       Context.insert(SkipJumpTable);
   7022     return;
   7023   }
   7024   case CaseCluster::Range: {
   7025     if (Case.isUnitRange()) {
   7026       // Single item
   7027       if (!DoneCmp) {
   7028         Constant *Value = Ctx->getConstantInt32(Case.getLow());
   7029         _cmp(Comparison, Value);
   7030       }
   7031       _br(Traits::Cond::Br_e, Case.getTarget());
   7032     } else if (DoneCmp && Case.isPairRange()) {
    7033       // Range of two items with the first item already compared against
   7034       _br(Traits::Cond::Br_e, Case.getTarget());
   7035       Constant *Value = Ctx->getConstantInt32(Case.getHigh());
   7036       _cmp(Comparison, Value);
   7037       _br(Traits::Cond::Br_e, Case.getTarget());
   7038     } else {
   7039       // Range
   7040       lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
   7041       _br(Traits::Cond::Br_be, Case.getTarget());
   7042     }
   7043     if (DefaultTarget != nullptr)
   7044       _br(DefaultTarget);
   7045     return;
   7046   }
   7047   }
   7048 }
   7049 
   7050 template <typename TraitsType>
   7051 void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
   7052   // Group cases together and navigate through them with a binary search
   7053   CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
   7054   Operand *Src0 = Instr->getComparison();
   7055   CfgNode *DefaultTarget = Instr->getLabelDefault();
   7056 
   7057   assert(CaseClusters.size() != 0); // Should always be at least one
   7058 
   7059   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
   7060     Src0 = legalize(Src0); // get Base/Index into physical registers
   7061     Operand *Src0Lo = loOperand(Src0);
   7062     Operand *Src0Hi = hiOperand(Src0);
   7063     if (CaseClusters.back().getHigh() > UINT32_MAX) {
   7064       // TODO(ascull): handle 64-bit case properly (currently naive version)
   7065       // This might be handled by a higher level lowering of switches.
   7066       SizeT NumCases = Instr->getNumCases();
   7067       if (NumCases >= 2) {
   7068         Src0Lo = legalizeToReg(Src0Lo);
   7069         Src0Hi = legalizeToReg(Src0Hi);
   7070       } else {
   7071         Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
   7072         Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
   7073       }
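               // Each 64-bit case is matched with two 32-bit compares: if the low
               // halves differ, skip to the next case; otherwise branch to the
               // target when the high halves also match.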
   7074       for (SizeT I = 0; I < NumCases; ++I) {
   7075         Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
   7076         Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
   7077         InstX86Label *Label = InstX86Label::create(Func, this);
   7078         _cmp(Src0Lo, ValueLo);
   7079         _br(Traits::Cond::Br_ne, Label);
   7080         _cmp(Src0Hi, ValueHi);
   7081         _br(Traits::Cond::Br_e, Instr->getLabel(I));
   7082         Context.insert(Label);
   7083       }
   7084       _br(Instr->getLabelDefault());
   7085       return;
   7086     } else {
   7087       // All the values are 32-bit so just check the operand is too and then
   7088       // fall through to the 32-bit implementation. This is a common case.
   7089       Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
   7090       Constant *Zero = Ctx->getConstantInt32(0);
   7091       _cmp(Src0Hi, Zero);
   7092       _br(Traits::Cond::Br_ne, DefaultTarget);
   7093       Src0 = Src0Lo;
   7094     }
   7095   }
   7096 
   7097   // 32-bit lowering
   7098 
   7099   if (CaseClusters.size() == 1) {
   7100     // Jump straight to default if needed. Currently a common case as jump
   7101     // tables occur on their own.
   7102     constexpr bool DoneCmp = false;
   7103     lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
   7104     return;
   7105   }
   7106 
   7107   // Going to be using multiple times so get it in a register early
   7108   Variable *Comparison = legalizeToReg(Src0);
   7109 
   7110   // A span is over the clusters
   7111   struct SearchSpan {
   7112     SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
   7113         : Begin(Begin), Size(Size), Label(Label) {}
   7114 
   7115     SizeT Begin;
   7116     SizeT Size;
   7117     InstX86Label *Label;
   7118   };
   7119   // The stack will only grow to the height of the tree so 12 should be plenty
   7120   std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
   7121   SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
   7122   bool DoneCmp = false;
   7123 
   7124   while (!SearchSpanStack.empty()) {
   7125     SearchSpan Span = SearchSpanStack.top();
   7126     SearchSpanStack.pop();
   7127 
   7128     if (Span.Label != nullptr)
   7129       Context.insert(Span.Label);
   7130 
   7131     switch (Span.Size) {
   7132     case 0:
   7133       llvm::report_fatal_error("Invalid SearchSpan size");
   7134       break;
   7135 
   7136     case 1:
   7137       lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
   7138                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
   7139       DoneCmp = false;
   7140       break;
   7141 
   7142     case 2: {
   7143       const CaseCluster *CaseA = &CaseClusters[Span.Begin];
   7144       const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
   7145 
   7146       // Placing a range last may allow register clobbering during the range
   7147       // test. That means there is no need to clone the register. If it is a
   7148       // unit range the comparison may have already been done in the binary
   7149       // search (DoneCmp) and so it should be placed first. If this is a range
   7150       // of two items and the comparison with the low value has already been
   7151       // done, comparing with the other element is cheaper than a range test.
   7152       // If the low end of the range is zero then there is no subtraction and
   7153       // nothing to be gained.
   7154       if (!CaseA->isUnitRange() &&
   7155           !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
   7156         std::swap(CaseA, CaseB);
   7157         DoneCmp = false;
   7158       }
   7159 
   7160       lowerCaseCluster(*CaseA, Comparison, DoneCmp);
   7161       DoneCmp = false;
   7162       lowerCaseCluster(*CaseB, Comparison, DoneCmp,
   7163                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
   7164     } break;
   7165 
   7166     default:
   7167       // Pick the middle item and branch b or ae
   7168       SizeT PivotIndex = Span.Begin + (Span.Size / 2);
   7169       const CaseCluster &Pivot = CaseClusters[PivotIndex];
   7170       Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
   7171       InstX86Label *Label = InstX86Label::create(Func, this);
   7172       _cmp(Comparison, Value);
    7173       // TODO(ascull): does it always have to be far?
   7174       _br(Traits::Cond::Br_b, Label, InstX86Br::Far);
   7175       // Lower the left and (pivot+right) sides, falling through to the right
   7176       SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
   7177       SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
   7178       DoneCmp = true;
   7179       break;
   7180     }
   7181   }
   7182 
   7183   _br(DefaultTarget);
   7184 }
   7185 
   7186 /// The following pattern occurs often in lowered C and C++ code:
   7187 ///
   7188 ///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
   7189 ///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
   7190 ///
   7191 /// We can eliminate the sext operation by copying the result of pcmpeqd,
   7192 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
   7193 /// sext operation.
   7194 template <typename TraitsType>
   7195 void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
   7196     Variable *SignExtendedResult) {
   7197   if (auto *NextCast =
   7198           llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
   7199     if (NextCast->getCastKind() == InstCast::Sext &&
   7200         NextCast->getSrc(0) == SignExtendedResult) {
   7201       NextCast->setDeleted();
   7202       _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
   7203       // Skip over the instruction.
   7204       Context.advanceNext();
   7205     }
   7206   }
   7207 }
   7208 
   7209 template <typename TraitsType>
   7210 void TargetX86Base<TraitsType>::lowerUnreachable(
   7211     const InstUnreachable * /*Instr*/) {
   7212   _ud2();
   7213   // Add a fake use of esp to make sure esp adjustments after the unreachable
   7214   // do not get dead-code eliminated.
   7215   keepEspLiveAtExit();
   7216 }
   7217 
   7218 template <typename TraitsType>
   7219 void TargetX86Base<TraitsType>::lowerBreakpoint(
   7220     const InstBreakpoint * /*Instr*/) {
   7221   _int3();
   7222 }
   7223 
   7224 template <typename TraitsType>
   7225 void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
   7226   // If the beacon variable's live range does not end in this instruction, then
   7227   // it must end in the modified Store instruction that follows. This means
   7228   // that the original Store instruction is still there, either because the
   7229   // value being stored is used beyond the Store instruction, or because dead
   7230   // code elimination did not happen. In either case, we cancel RMW lowering
   7231   // (and the caller deletes the RMW instruction).
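           // The pattern being lowered is, e.g.:
           //   t = load [addr]; t2 = t op src; store t2, [addr]
           // which collapses into a single memory-destination instruction such as
           //   add src, [addr]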
   7232   if (!RMW->isLastUse(RMW->getBeacon()))
   7233     return;
   7234   Operand *Src = RMW->getData();
   7235   Type Ty = Src->getType();
   7236   X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
   7237   doMockBoundsCheck(Addr);
   7238   if (!Traits::Is64Bit && Ty == IceType_i64) {
   7239     Src = legalizeUndef(Src);
   7240     Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
   7241     Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
   7242     auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
   7243     auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
   7244     switch (RMW->getOp()) {
   7245     default:
   7246       // TODO(stichnot): Implement other arithmetic operators.
   7247       break;
   7248     case InstArithmetic::Add:
   7249       _add_rmw(AddrLo, SrcLo);
   7250       _adc_rmw(AddrHi, SrcHi);
   7251       return;
   7252     case InstArithmetic::Sub:
   7253       _sub_rmw(AddrLo, SrcLo);
   7254       _sbb_rmw(AddrHi, SrcHi);
   7255       return;
   7256     case InstArithmetic::And:
   7257       _and_rmw(AddrLo, SrcLo);
   7258       _and_rmw(AddrHi, SrcHi);
   7259       return;
   7260     case InstArithmetic::Or:
   7261       _or_rmw(AddrLo, SrcLo);
   7262       _or_rmw(AddrHi, SrcHi);
   7263       return;
   7264     case InstArithmetic::Xor:
   7265       _xor_rmw(AddrLo, SrcLo);
   7266       _xor_rmw(AddrHi, SrcHi);
   7267       return;
   7268     }
   7269   } else {
   7270     // x86-32: i8, i16, i32
   7271     // x86-64: i8, i16, i32, i64
   7272     switch (RMW->getOp()) {
   7273     default:
   7274       // TODO(stichnot): Implement other arithmetic operators.
   7275       break;
   7276     case InstArithmetic::Add:
   7277       Src = legalize(Src, Legal_Reg | Legal_Imm);
   7278       _add_rmw(Addr, Src);
   7279       return;
   7280     case InstArithmetic::Sub:
   7281       Src = legalize(Src, Legal_Reg | Legal_Imm);
   7282       _sub_rmw(Addr, Src);
   7283       return;
   7284     case InstArithmetic::And:
   7285       Src = legalize(Src, Legal_Reg | Legal_Imm);
   7286       _and_rmw(Addr, Src);
   7287       return;
   7288     case InstArithmetic::Or:
   7289       Src = legalize(Src, Legal_Reg | Legal_Imm);
   7290       _or_rmw(Addr, Src);
   7291       return;
   7292     case InstArithmetic::Xor:
   7293       Src = legalize(Src, Legal_Reg | Legal_Imm);
   7294       _xor_rmw(Addr, Src);
   7295       return;
   7296     }
   7297   }
   7298   llvm::report_fatal_error("Couldn't lower RMW instruction");
   7299 }
   7300 
   7301 template <typename TraitsType>
   7302 void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
   7303   if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
   7304     lowerRMW(RMW);
   7305   } else {
   7306     TargetLowering::lowerOther(Instr);
   7307   }
   7308 }
   7309 
   7310 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
   7311 /// integrity of liveness analysis. Undef values are also turned into zeroes,
   7312 /// since loOperand() and hiOperand() don't expect Undef input.  Also, in
   7313 /// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand.
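         ///
         /// For example, on a 32-bit target (lo()/hi() below are informal notation
         /// for the 32-bit halves, not actual operand names):
         ///   %x = phi i64 [ %a, %B1 ], [ %b, %B2 ]
         /// becomes
         ///   %x.lo = phi i32 [ lo(%a), %B1 ], [ lo(%b), %B2 ]
         ///   %x.hi = phi i32 [ hi(%a), %B1 ], [ hi(%b), %B2 ]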
   7314 template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
   7315   if (getFlags().getUseNonsfi()) {
   7316     assert(RebasePtr);
   7317     CfgNode *Node = Context.getNode();
   7318     uint32_t RebasePtrUseCount = 0;
   7319     for (Inst &I : Node->getPhis()) {
   7320       auto *Phi = llvm::dyn_cast<InstPhi>(&I);
   7321       if (Phi->isDeleted())
   7322         continue;
   7323       for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
   7324         Operand *Src = Phi->getSrc(I);
   7325         // TODO(stichnot): This over-counts for +0.0, and under-counts for other
   7326         // kinds of pooling.
   7327         if (llvm::isa<ConstantRelocatable>(Src) ||
   7328             llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) {
   7329           ++RebasePtrUseCount;
   7330         }
   7331       }
   7332     }
   7333     if (RebasePtrUseCount) {
   7334       Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr));
   7335     }
   7336   }
   7337   if (Traits::Is64Bit) {
   7338     // On x86-64 we don't need to prelower phis -- the architecture can handle
   7339     // 64-bit integer natively.
   7340     return;
   7341   }
   7342 
    7343   // Pause constant blinding or pooling; blinding or pooling will be done
    7344   // later during phi lowering assignments.
   7345   BoolFlagSaver B(RandomizationPoolingPaused, true);
   7346   PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
   7347       this, Context.getNode(), Func);
   7348 }
   7349 
   7350 template <typename TraitsType>
   7351 void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
   7352   uint32_t StackArgumentsSize = 0;
   7353   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
   7354     RuntimeHelper HelperID = RuntimeHelper::H_Num;
   7355     Variable *Dest = Arith->getDest();
   7356     Type DestTy = Dest->getType();
   7357     if (!Traits::Is64Bit && DestTy == IceType_i64) {
   7358       switch (Arith->getOp()) {
   7359       default:
   7360         return;
   7361       case InstArithmetic::Udiv:
   7362         HelperID = RuntimeHelper::H_udiv_i64;
   7363         break;
   7364       case InstArithmetic::Sdiv:
   7365         HelperID = RuntimeHelper::H_sdiv_i64;
   7366         break;
   7367       case InstArithmetic::Urem:
   7368         HelperID = RuntimeHelper::H_urem_i64;
   7369         break;
   7370       case InstArithmetic::Srem:
   7371         HelperID = RuntimeHelper::H_srem_i64;
   7372         break;
   7373       }
   7374     } else if (isVectorType(DestTy)) {
   7375       Variable *Dest = Arith->getDest();
   7376       Operand *Src0 = Arith->getSrc(0);
   7377       Operand *Src1 = Arith->getSrc(1);
   7378       switch (Arith->getOp()) {
   7379       default:
   7380         return;
   7381       case InstArithmetic::Mul:
   7382         if (DestTy == IceType_v16i8) {
   7383           scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
   7384           Arith->setDeleted();
   7385         }
   7386         return;
   7387       case InstArithmetic::Shl:
   7388       case InstArithmetic::Lshr:
   7389       case InstArithmetic::Ashr:
   7390         if (llvm::isa<Constant>(Src1)) {
   7391           return;
   7392         }
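                 // A non-constant shift amount falls through and is scalarized
                 // below, like the division and remainder operations.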
   7393       case InstArithmetic::Udiv:
   7394       case InstArithmetic::Urem:
   7395       case InstArithmetic::Sdiv:
   7396       case InstArithmetic::Srem:
   7397       case InstArithmetic::Frem:
   7398         scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
   7399         Arith->setDeleted();
   7400         return;
   7401       }
   7402     } else {
   7403       switch (Arith->getOp()) {
   7404       default:
   7405         return;
   7406       case InstArithmetic::Frem:
   7407         if (isFloat32Asserting32Or64(DestTy))
   7408           HelperID = RuntimeHelper::H_frem_f32;
   7409         else
   7410           HelperID = RuntimeHelper::H_frem_f64;
   7411       }
   7412     }
   7413     constexpr SizeT MaxSrcs = 2;
   7414     InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
   7415     Call->addArg(Arith->getSrc(0));
   7416     Call->addArg(Arith->getSrc(1));
   7417     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
   7418     Context.insert(Call);
   7419     Arith->setDeleted();
   7420   } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
   7421     InstCast::OpKind CastKind = Cast->getCastKind();
   7422     Operand *Src0 = Cast->getSrc(0);
   7423     const Type SrcType = Src0->getType();
   7424     Variable *Dest = Cast->getDest();
   7425     const Type DestTy = Dest->getType();
   7426     RuntimeHelper HelperID = RuntimeHelper::H_Num;
   7427     Variable *CallDest = Dest;
   7428     switch (CastKind) {
   7429     default:
   7430       return;
   7431     case InstCast::Fptosi:
   7432       if (!Traits::Is64Bit && DestTy == IceType_i64) {
   7433         HelperID = isFloat32Asserting32Or64(SrcType)
   7434                        ? RuntimeHelper::H_fptosi_f32_i64
   7435                        : RuntimeHelper::H_fptosi_f64_i64;
   7436       } else {
   7437         return;
   7438       }
   7439       break;
   7440     case InstCast::Fptoui:
   7441       if (isVectorType(DestTy)) {
   7442         assert(DestTy == IceType_v4i32);
   7443         assert(SrcType == IceType_v4f32);
   7444         HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
   7445       } else if (DestTy == IceType_i64 ||
   7446                  (!Traits::Is64Bit && DestTy == IceType_i32)) {
   7447         if (Traits::Is64Bit) {
   7448           HelperID = isFloat32Asserting32Or64(SrcType)
   7449                          ? RuntimeHelper::H_fptoui_f32_i64
   7450                          : RuntimeHelper::H_fptoui_f64_i64;
   7451         } else if (isInt32Asserting32Or64(DestTy)) {
   7452           HelperID = isFloat32Asserting32Or64(SrcType)
   7453                          ? RuntimeHelper::H_fptoui_f32_i32
   7454                          : RuntimeHelper::H_fptoui_f64_i32;
   7455         } else {
   7456           HelperID = isFloat32Asserting32Or64(SrcType)
   7457                          ? RuntimeHelper::H_fptoui_f32_i64
   7458                          : RuntimeHelper::H_fptoui_f64_i64;
   7459         }
   7460       } else {
   7461         return;
   7462       }
   7463       break;
   7464     case InstCast::Sitofp:
   7465       if (!Traits::Is64Bit && SrcType == IceType_i64) {
   7466         HelperID = isFloat32Asserting32Or64(DestTy)
   7467                        ? RuntimeHelper::H_sitofp_i64_f32
   7468                        : RuntimeHelper::H_sitofp_i64_f64;
   7469       } else {
   7470         return;
   7471       }
   7472       break;
   7473     case InstCast::Uitofp:
   7474       if (isVectorType(SrcType)) {
   7475         assert(DestTy == IceType_v4f32);
   7476         assert(SrcType == IceType_v4i32);
   7477         HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
   7478       } else if (SrcType == IceType_i64 ||
   7479                  (!Traits::Is64Bit && SrcType == IceType_i32)) {
   7480         if (isInt32Asserting32Or64(SrcType)) {
   7481           HelperID = isFloat32Asserting32Or64(DestTy)
   7482                          ? RuntimeHelper::H_uitofp_i32_f32
   7483                          : RuntimeHelper::H_uitofp_i32_f64;
   7484         } else {
   7485           HelperID = isFloat32Asserting32Or64(DestTy)
   7486                          ? RuntimeHelper::H_uitofp_i64_f32
   7487                          : RuntimeHelper::H_uitofp_i64_f64;
   7488         }
   7489       } else {
   7490         return;
   7491       }
   7492       break;
   7493     case InstCast::Bitcast: {
   7494       if (DestTy == Src0->getType())
   7495         return;
   7496       switch (DestTy) {
   7497       default:
   7498         return;
   7499       case IceType_i8:
   7500         assert(Src0->getType() == IceType_v8i1);
   7501         HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
   7502         CallDest = Func->makeVariable(IceType_i32);
   7503         break;
   7504       case IceType_i16:
   7505         assert(Src0->getType() == IceType_v16i1);
   7506         HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
   7507         CallDest = Func->makeVariable(IceType_i32);
   7508         break;
   7509       case IceType_v8i1: {
   7510         assert(Src0->getType() == IceType_i8);
   7511         HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
   7512         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
   7513         // Arguments to functions are required to be at least 32 bits wide.
   7514         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
   7515         Src0 = Src0AsI32;
   7516       } break;
   7517       case IceType_v16i1: {
   7518         assert(Src0->getType() == IceType_i16);
   7519         HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
   7520         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
   7521         // Arguments to functions are required to be at least 32 bits wide.
   7522         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
   7523         Src0 = Src0AsI32;
   7524       } break;
   7525       }
   7526     } break;
   7527     }
   7528     constexpr SizeT MaxSrcs = 1;
   7529     InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
   7530     Call->addArg(Src0);
   7531     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
   7532     Context.insert(Call);
   7533     // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
   7534     // result to the appropriate type as necessary.
   7535     if (CallDest->getType() != Dest->getType())
   7536       Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
   7537     Cast->setDeleted();
   7538   } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsicCall>(Instr)) {
   7539     CfgVector<Type> ArgTypes;
   7540     Type ReturnType = IceType_void;
   7541     switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicInfo().ID) {
   7542     default:
   7543       return;
   7544     case Intrinsics::Ctpop: {
   7545       Operand *Val = Intrinsic->getArg(0);
   7546       Type ValTy = Val->getType();
   7547       if (ValTy == IceType_i64)
   7548         ArgTypes = {IceType_i64};
   7549       else
   7550         ArgTypes = {IceType_i32};
   7551       ReturnType = IceType_i32;
   7552     } break;
   7553     case Intrinsics::Longjmp:
   7554       ArgTypes = {IceType_i32, IceType_i32};
   7555       ReturnType = IceType_void;
   7556       break;
   7557     case Intrinsics::Memcpy:
   7558       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
   7559       ReturnType = IceType_void;
   7560       break;
   7561     case Intrinsics::Memmove:
   7562       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
   7563       ReturnType = IceType_void;
   7564       break;
   7565     case Intrinsics::Memset:
   7566       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
   7567       ReturnType = IceType_void;
   7568       break;
   7569     case Intrinsics::NaClReadTP:
   7570       ReturnType = IceType_i32;
   7571       break;
   7572     case Intrinsics::Setjmp:
   7573       ArgTypes = {IceType_i32};
   7574       ReturnType = IceType_i32;
   7575       break;
   7576     }
   7577     StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
   7578   } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
   7579     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
   7580   } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
   7581     if (!Ret->hasRetValue())
   7582       return;
   7583     Operand *RetValue = Ret->getRetValue();
   7584     Type ReturnType = RetValue->getType();
   7585     if (!isScalarFloatingType(ReturnType))
   7586       return;
   7587     StackArgumentsSize = typeWidthInBytes(ReturnType);
   7588   } else {
   7589     return;
   7590   }
   7591   StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
   7592   updateMaxOutArgsSizeBytes(StackArgumentsSize);
   7593 }
   7594 
   7595 template <typename TraitsType>
   7596 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
   7597     const CfgVector<Type> &ArgTypes, Type ReturnType) {
   7598   uint32_t OutArgumentsSizeBytes = 0;
   7599   uint32_t XmmArgCount = 0;
   7600   uint32_t GprArgCount = 0;
   7601   for (Type Ty : ArgTypes) {
   7602     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
   7603     assert(typeWidthInBytes(Ty) >= 4);
   7604     if (isVectorType(Ty) && XmmArgCount < Traits::X86_MAX_XMM_ARGS) {
   7605       ++XmmArgCount;
   7606     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
   7607                XmmArgCount < Traits::X86_MAX_XMM_ARGS) {
   7608       ++XmmArgCount;
   7609     } else if (isScalarIntegerType(Ty) &&
   7610                GprArgCount < Traits::X86_MAX_GPR_ARGS) {
   7611       // The 64 bit ABI allows some integers to be passed in GPRs.
   7612       ++GprArgCount;
   7613     } else {
   7614       if (isVectorType(Ty)) {
   7615         OutArgumentsSizeBytes =
   7616             Traits::applyStackAlignment(OutArgumentsSizeBytes);
   7617       }
   7618       OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
   7619     }
   7620   }
   7621   if (Traits::Is64Bit)
   7622     return OutArgumentsSizeBytes;
   7623   // The 32 bit ABI requires floating point values to be returned on the x87 FP
    7624   // stack. Ensure there is enough space for the fstp/movs used for floating returns.
   7625   if (isScalarFloatingType(ReturnType)) {
   7626     OutArgumentsSizeBytes =
   7627         std::max(OutArgumentsSizeBytes,
   7628                  static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
   7629   }
   7630   return OutArgumentsSizeBytes;
   7631 }
   7632 
   7633 template <typename TraitsType>
   7634 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
   7635     const InstCall *Instr) {
   7636   // Build a vector of the arguments' types.
   7637   const SizeT NumArgs = Instr->getNumArgs();
   7638   CfgVector<Type> ArgTypes;
   7639   ArgTypes.reserve(NumArgs);
   7640   for (SizeT i = 0; i < NumArgs; ++i) {
   7641     Operand *Arg = Instr->getArg(i);
   7642     ArgTypes.emplace_back(Arg->getType());
   7643   }
    7644   // Compute the return type (if any).
   7645   Type ReturnType = IceType_void;
   7646   Variable *Dest = Instr->getDest();
   7647   if (Dest != nullptr)
   7648     ReturnType = Dest->getType();
   7649   return getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
   7650 }
   7651 
   7652 template <typename TraitsType>
   7653 Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
   7654                                                         RegNumT RegNum) {
   7655   Variable *Reg = makeReg(Ty, RegNum);
   7656   switch (Ty) {
   7657   case IceType_i1:
   7658   case IceType_i8:
   7659   case IceType_i16:
   7660   case IceType_i32:
   7661   case IceType_i64:
   7662     // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
   7663     _mov(Reg, Ctx->getConstantZero(Ty));
   7664     break;
   7665   case IceType_f32:
   7666   case IceType_f64:
   7667     Context.insert<InstFakeDef>(Reg);
   7668     _xorps(Reg, Reg);
   7669     break;
   7670   default:
   7671     // All vector types use the same pxor instruction.
   7672     assert(isVectorType(Ty));
   7673     Context.insert<InstFakeDef>(Reg);
   7674     _pxor(Reg, Reg);
   7675     break;
   7676   }
   7677   return Reg;
   7678 }
   7679 
   7680 // There is no support for loading or emitting vector constants, so the vector
   7681 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
   7682 // initialized with register operations.
   7683 //
   7684 // TODO(wala): Add limited support for vector constants so that complex
   7685 // initialization in registers is unnecessary.
   7686 
   7687 template <typename TraitsType>
   7688 Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
   7689                                                        RegNumT RegNum) {
   7690   return makeZeroedRegister(Ty, RegNum);
   7691 }
   7692 
   7693 template <typename TraitsType>
   7694 Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
   7695                                                            RegNumT RegNum) {
   7696   Variable *MinusOnes = makeReg(Ty, RegNum);
   7697   // Insert a FakeDef so the live range of MinusOnes is not overestimated.
   7698   Context.insert<InstFakeDef>(MinusOnes);
   7699   if (Ty == IceType_f64)
   7700     // Making a vector of minus ones of type f64 is currently only used for the
   7701     // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
   7702     // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
   7703     // same job and only requires SSE2.
   7704     _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
   7705   else
   7706     _pcmpeq(MinusOnes, MinusOnes);
   7707   return MinusOnes;
   7708 }
   7709 
   7710 template <typename TraitsType>
   7711 Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
   7712   Variable *Dest = makeVectorOfZeros(Ty, RegNum);
   7713   Variable *MinusOne = makeVectorOfMinusOnes(Ty);
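           // Each lane computes 0 - (-1) == 1, yielding a vector of all ones.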
   7714   _psub(Dest, MinusOne);
   7715   return Dest;
   7716 }
   7717 
   7718 template <typename TraitsType>
   7719 Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
   7720                                                                RegNumT RegNum) {
   7721   assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
   7722          Ty == IceType_v16i8);
   7723   if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
   7724     Variable *Reg = makeVectorOfOnes(Ty, RegNum);
   7725     SizeT Shift =
   7726         typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
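             // e.g. for v8i16: Shift = 16 - 1 = 15, turning each 0xffff lane into
             // 0x8000.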
   7727     _psll(Reg, Ctx->getConstantInt8(Shift));
   7728     return Reg;
   7729   } else {
   7730     // SSE has no left shift operation for vectors of 8 bit integers.
   7731     constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
   7732     Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
   7733     Variable *Reg = makeReg(Ty, RegNum);
   7734     _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
   7735     _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
   7736     return Reg;
   7737   }
   7738 }
   7739 
   7740 /// Construct a mask in a register that can be and'ed with a floating-point
   7741 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
   7742 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
   7743 /// ones logically right shifted one bit.
   7744 // TODO(stichnot): Fix the wala
   7745 // TODO: above, to represent vector constants in memory.
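/// For example (a sketch; the allocator picks the real register), for v4f32
/// this emits:
///   pcmpeqd %xmm0, %xmm0   /* <4 x 0xffffffff> */
///   psrld   $1, %xmm0      /* <4 x 0x7fffffff> */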
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
                                                          RegNumT RegNum) {
  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
  _psrl(Reg, Ctx->getConstantInt8(1));
  return Reg;
}

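// For example (a sketch with a hypothetical register), requesting a 32-bit
// view of a stack slot at byte offset 8 produces:
//   lea <Slot's frame address>, %eax
// and returns the memory operand 8(%eax).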
template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
                                                        uint32_t Offset) {
  // Ensure that Slot is actually a stack slot.
  assert(Slot->mustNotHaveReg());
  assert(Slot->getRegNum().hasNoValue());
  // Compute the location of Slot in memory.
  // TODO(wala,stichnot): lea should not be required. The address of the stack
  // slot is known at compile time (although not until after addProlog()).
  const Type PointerType = getPointerType();
  Variable *Loc = makeReg(PointerType);
  _lea(Loc, Slot);
  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
}

/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
/// Src is assumed to already be legalized.  If the source operand is known to
/// be a memory or immediate operand, a simple mov will suffice.  But if the
/// source operand can be a physical register, then it must first be copied into
/// a physical register that is truncable to 8-bit, then truncated into a
/// physical register that can receive a truncation, and finally copied into the
/// result 8-bit register (which in general can be any 8-bit register).  For
/// example, moving %ebp into %ah may be accomplished as:
///   movl %ebp, %edx
///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
///   movb %dl, %ah
/// On the other hand, moving a memory or immediate operand into ah:
///   movb 4(%ebp), %ah
///   movb $my_imm, %ah
///
/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
/// use RegNum=RegNumT() and then let the caller do a separate copy into
/// Reg_ah.
///
/// Note #2.  ConstantRelocatable operands are also put through this process
/// (not truncated directly) because our ELF emitter does R_386_32 relocations
/// but not R_386_8 relocations.
///
/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
/// to the pinsrb instruction.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
  Type Ty = Src->getType();
  assert(isScalarIntegerType(Ty));
  assert(Ty != IceType_i1);
  Variable *Reg = makeReg(IceType_i8, RegNum);
  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
    Variable *SrcTruncable = makeReg(Ty);
    switch (Ty) {
    case IceType_i64:
      SrcTruncable->setRegClass(RCX86_Is64To8);
      break;
    case IceType_i32:
      SrcTruncable->setRegClass(RCX86_Is32To8);
      break;
    case IceType_i16:
      SrcTruncable->setRegClass(RCX86_Is16To8);
      break;
    default:
      // i8 - just use the default register class.
      break;
    }
    Variable *SrcRcvr = makeReg(IceType_i8);
    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
    _mov(SrcTruncable, Src);
    _mov(SrcRcvr, SrcTruncable);
    Src = SrcRcvr;
  }
  _mov(Reg, Src);
  return Reg;
}

/// Helper for legalize() to emit the right code to lower an operand to a
/// register of the appropriate type.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
  Type Ty = Src->getType();
  Variable *Reg = makeReg(Ty, RegNum);
  if (isVectorType(Ty)) {
    _movp(Reg, Src);
  } else {
    _mov(Reg, Src);
  }
  return Reg;
}

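// For illustration, a typical call site in an instruction lowering looks like
// this (a hypothetical sketch, not a quote of any particular lowering):
//   Operand *Src0 = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
//   Variable *T = makeReg(Dest->getType());
//   _mov(T, Src0);
// Passing a narrower LegalMask forces the operand into a shape the target
// instruction can actually encode.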
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
                                             RegNumT RegNum) {
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const Type Ty = From->getType();
  // Assert that a physical register is allowed. To date, all calls to
  // legalize() allow a physical register. If a physical register needs to be
  // explicitly disallowed, then new code will need to be written to force a
  // spill.
  assert(Allowed & Legal_Reg);
  // If we're asking for a specific physical register, make sure we're not
  // allowing any other operand kinds. (This could be future work, e.g. allow
  // the shl shift amount to be either an immediate or in ecx.)
  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);

  // Substitute with an available infinite-weight variable if possible.  Only
  // do this when we are not asking for a specific register, and when the
  // substitution is not locked to a specific register, and when the types
  // match, in order to capture the vast majority of opportunities and avoid
  // corner cases in the lowering.
  if (RegNum.hasNoValue()) {
    if (Variable *Subst = getContext().availabilityGet(From)) {
      // At this point we know there is a potential substitution available.
      if (Subst->mustHaveReg() && !Subst->hasReg()) {
        // At this point we know the substitution will have a register.
        if (From->getType() == Subst->getType()) {
          // At this point we know the substitution's register is compatible.
          return Subst;
        }
      }
    }
  }

  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
    // Before doing anything with a Mem operand, we need to ensure that the
    // Base and Index components are in physical registers.
    Variable *Base = Mem->getBase();
    Variable *Index = Mem->getIndex();
    Constant *Offset = Mem->getOffset();
    Variable *RegBase = nullptr;
    Variable *RegIndex = nullptr;
    uint16_t Shift = Mem->getShift();
    if (Base) {
      RegBase = llvm::cast<Variable>(
          legalize(Base, Legal_Reg | Legal_Rematerializable));
    }
    if (Index) {
      // TODO(jpp): perhaps we should only allow Legal_Reg if
      // Base->isRematerializable.
      RegIndex = llvm::cast<Variable>(
          legalize(Index, Legal_Reg | Legal_Rematerializable));
    }

    if (Base != RegBase || Index != RegIndex) {
      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
                                  Mem->getSegmentRegister());
    }

    // For all memory operands, we do randomization/pooling here.
    From = randomizeOrPoolImmediate(Mem);

    if (!(Allowed & Legal_Mem)) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
    if (llvm::isa<ConstantUndef>(Const)) {
      From = legalizeUndef(Const, RegNum);
      if (isVectorType(Ty))
        return From;
      Const = llvm::cast<Constant>(From);
    }
    // There should be no constants of vector type (other than undef).
    assert(!isVectorType(Ty));

    // If the operand is a 64-bit constant integer, we need to legalize it to
    // a register in x86-64.
    if (Traits::Is64Bit) {
      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
        if (!Utils::IsInt(32, C64->getValue())) {
          if (RegNum.hasValue()) {
            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
          }
          return copyToReg(Const, RegNum);
        }
      }
    }

    // If the operand is a 32-bit constant integer, check whether it needs to
    // be randomized or pooled.
    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
      Operand *NewConst = randomizeOrPoolImmediate(C, RegNum);
      if (NewConst != Const) {
        return NewConst;
      }
    }

    if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) {
      // If the operand is a ConstantRelocatable, Legal_AddrAbs is not
      // specified, and UseNonsfi is indicated, we need to add RebasePtr.
      if (UseNonsfi && !(Allowed & Legal_AddrAbs)) {
        assert(Ty == IceType_i32);
        Variable *NewVar = makeReg(Ty, RegNum);
        auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR);
        // LEAs are not automatically sandboxed, thus we explicitly invoke
        // _sandbox_mem_reference.
        _lea(NewVar, _sandbox_mem_reference(Mem));
        From = NewVar;
      }
    } else if (isScalarFloatingType(Ty)) {
      // Convert a scalar floating point constant into an explicit memory
      // operand.
      if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
        if (Utils::isPositiveZero(ConstFloat->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
        if (Utils::isPositiveZero(ConstDouble->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      }

      auto *CFrom = llvm::cast<Constant>(From);
      assert(CFrom->getShouldBePooled());
      Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
      auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
      From = Mem;
    }

    bool NeedsReg = false;
    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
      // Immediates are specifically not allowed.
      NeedsReg = true;
    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
      // On x86, FP constants are lowered to mem operands.
      NeedsReg = true;
    if (NeedsReg) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
    // Check if the variable is guaranteed a physical register. This can happen
    // either when the variable is pre-colored or when it is assigned infinite
    // weight.
    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
    bool MustRematerialize =
        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
    // We need a new physical register for the operand if:
    // - Mem is not allowed and Var isn't guaranteed a physical register, or
    // - RegNum is required and Var->getRegNum() doesn't match, or
    // - Var is a rematerializable variable and rematerializable pass-through
    //   is not allowed (in which case we need an lea instruction).
    if (MustRematerialize) {
      assert(Ty == IceType_i32);
      Variable *NewVar = makeReg(Ty, RegNum);
      // Since Var is rematerializable, the offset will be added when the lea
      // is emitted.
      constexpr Constant *NoOffset = nullptr;
      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
      _lea(NewVar, Mem);
      From = NewVar;
    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  llvm::report_fatal_error("Unhandled operand kind in legalize()");
  return From;
}

/// Provide a trivial wrapper to legalize() for this common usage.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
                                                   RegNumT RegNum) {
  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
}

/// Legalize undef values to concrete values.
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
                                                  RegNumT RegNum) {
  Type Ty = From->getType();
  if (llvm::isa<ConstantUndef>(From)) {
    // Lower undefs to zero.  Another option is to lower undefs to an
    // uninitialized register; however, using an uninitialized register
    // results in less predictable code.
    //
    // If in the future the implementation is changed to lower undef values to
    // uninitialized registers, a FakeDef will be needed:
    //     Context.insert<InstFakeDef>(Reg);
    // This is in order to ensure that the live range of Reg is not
    // overestimated.  If the constant being lowered is a 64-bit value, then
    // the result should be split, and the lo and hi components will need to
    // go in uninitialized registers.
    if (isVectorType(Ty))
      return makeVectorOfZeros(Ty, RegNum);
    return Ctx->getConstantZero(Ty);
  }
  return From;
}

/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
/// copied into a physical register. (Actually, either Src0 or Src1 can be
/// chosen for the physical register, but unfortunately we have to commit to
/// one or the other before register allocation.)
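/// For example (a sketch): for "a < b" where b is an immediate, a may stay in
/// memory, since
///   cmpl $42, 8(%esp)
/// is encodable; but if b might also end up in memory, a is copied into a
/// register first so the cmp never sees two memory operands.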
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
                                                       Operand *Src1) {
  bool IsSrc1ImmOrReg = false;
  if (llvm::isa<Constant>(Src1)) {
    IsSrc1ImmOrReg = true;
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
    if (Var->hasReg())
      IsSrc1ImmOrReg = true;
  }
  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
                                             bool DoLegalize) {
  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
  // It may be the case that address mode optimization already creates an
  // X86OperandMem, so in that case it wouldn't need another level of
  // transformation.
  if (!Mem) {
    auto *Base = llvm::dyn_cast<Variable>(Opnd);
    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
    assert(Base || Offset);
    if (Offset) {
      // During memory operand building, we do not blind or pool the constant
      // offset; we will work on the whole memory operand as one entity later,
      // which saves one instruction. By turning blinding and pooling off, we
      // guarantee that legalize(Offset) will return a Constant*.
      if (!llvm::isa<ConstantRelocatable>(Offset)) {
        BoolFlagSaver B(RandomizationPoolingPaused, true);

        Offset = llvm::cast<Constant>(legalize(Offset));
      }

      assert(llvm::isa<ConstantInteger32>(Offset) ||
             llvm::isa<ConstantRelocatable>(Offset));
    }
    // Not completely sure whether it's OK to leave IsRebased unset when
    // creating the mem operand.  If DoLegalize is true, it will definitely be
    // applied during the legalize() call, but perhaps not during the
    // randomizeOrPoolImmediate() call.  In any case, the emit routines will
    // assert that PIC legalization has been applied.
    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
  }
  // Do legalization, which includes randomization/pooling; otherwise do just
  // the randomization/pooling.
  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem)
                                              : randomizeOrPoolImmediate(Mem));
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
  // There aren't any 64-bit integer registers for x86-32.
  assert(Traits::Is64Bit || Type != IceType_i64);
  Variable *Reg = Func->makeVariable(Type);
  if (RegNum.hasValue())
    Reg->setRegNum(RegNum);
  else
    Reg->setMustHaveReg();
  return Reg;
}

const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
                            IceType_v16i8};

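// For example (a worked sketch of the index math): largestTypeInSize(6)
// computes TyIndex = findLastSet(6) = 2, yielding TypeForSize[2] =
// IceType_i32, the widest type in the table that fits in 6 bytes.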
template <typename TraitsType>
Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
                                                  uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

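// Continuing the example: firstTypeThatFitsSize(6) starts from TyIndex = 2
// and, because 6 is not a power of 2, bumps it to 3, yielding TypeForSize[3]
// = IceType_f64, the narrowest type that can hold 6 bytes (subject to the
// MaxSize clamp).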
template <typename TraitsType>
Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
                                                      uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  if (!llvm::isPowerOf2_32(Size))
    ++TyIndex;
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
  if (Func->getOptLevel() == Opt_m1)
    return;
  markRedefinitions();
  Context.availabilityUpdate();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::makeRandomRegisterPermutation(
    llvm::SmallVectorImpl<RegNumT> &Permutation,
    const SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
  Traits::makeRandomRegisterPermutation(Func, Permutation, ExcludeRegisters,
                                        Salt);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << "$" << C->getValue();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
  if (!Traits::Is64Bit) {
    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
  } else {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    Str << "$" << C->getValue();
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
  llvm::report_fatal_error("undef value encountered by emitter.");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantRelocatable *C) const {
  if (!BuildDefs::dump())
    return;
  assert(!getFlags().getUseNonsfi() ||
         C->getName().toString() == GlobalOffsetTable);
  Ostream &Str = Ctx->getStrEmit();
  Str << "$";
  emitWithoutPrefix(C);
}

/// Randomize or pool an Immediate.
template <typename TraitsType>
Operand *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(Constant *Immediate,
                                                    RegNumT RegNum) {
  assert(llvm::isa<ConstantInteger32>(Immediate) ||
         llvm::isa<ConstantRelocatable>(Immediate));
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is off or paused.
    return Immediate;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return Immediate;
  }

  if (!Immediate->shouldBeRandomizedOrPooled()) {
    // The constant Immediate is not eligible for blinding/pooling.
    return Immediate;
  }
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant.
    // FROM:
    //  imm
    // TO:
    //  insert: mov imm+cookie, Reg
    //  insert: lea -cookie[Reg], Reg
    //  => Reg
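    // Concretely (a sketch with a hypothetical cookie value of 0x9e3779b9 and
    // imm = 0x1234):
    //   mov $0x9e378bed, Reg        /* imm + cookie */
    //   lea -0x9e3779b9(Reg), Reg   /* Reg == 0x1234; FLAGS untouched */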
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In that case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain. That is why the RegNum argument is passed through. Note that we
    // use an lea instruction instead of xor to avoid affecting FLAGS.
    Variable *Reg = makeReg(IceType_i32, RegNum);
    auto *Integer = llvm::cast<ConstantInteger32>(Immediate);
    uint32_t Value = Integer->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
    Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
    _lea(Reg, X86OperandMem::create(Func, IceType_i32, Reg, Offset));
    if (Immediate->getType() == IceType_i32) {
      return Reg;
    }
    Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
    _mov(TruncReg, Reg);
    return TruncReg;
  }
  case RPI_Pool: {
    // Pool the constant.
    // FROM:
    //  imm
    // TO:
    //  insert: mov $label, Reg
    //  => Reg
    assert(getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
    assert(Immediate->getShouldBePooled());
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In that case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain; hence the RegNum argument.
    Variable *Reg = makeReg(Immediate->getType(), RegNum);
    constexpr RelocOffsetT Offset = 0;
    Constant *Symbol = Ctx->getConstantSym(Offset, Immediate->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *MemOperand =
        X86OperandMem::create(Func, Immediate->getType(), NoBase, Symbol);
    _mov(Reg, MemOperand);
    return Reg;
  }
  }
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(X86OperandMem *MemOperand,
                                                    RegNumT RegNum) {
  assert(MemOperand);
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is off or paused.
    return MemOperand;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return MemOperand;
  }

  // If this memory operand is already a randomized one, we do not randomize
  // it again.
  if (MemOperand->getRandomized())
    return MemOperand;

  auto *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset());

  if (C == nullptr) {
    return MemOperand;
  }

  if (!C->shouldBeRandomizedOrPooled()) {
    return MemOperand;
  }

  // The offset of this mem operand should be blinded or pooled.
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant offset.
    // FROM:
    //  offset[base, index, shift]
    // TO:
    //  insert: lea offset+cookie[base], RegTemp
    //  => -cookie[RegTemp, index, shift]
    uint32_t Value =
        llvm::dyn_cast<ConstantInteger32>(MemOperand->getOffset())->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    Constant *Mask1 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), Cookie + Value);
    Constant *Mask2 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);

    X86OperandMem *TempMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In that case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain; hence the RegNum argument.
    Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
    _lea(RegTemp, TempMemOperand);

    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister(),
        MemOperand->getIsRebased());

    // Label this memory operand as randomized, so we won't randomize it
    // again in case we call legalize() multiple times on this memory
    // operand.
    NewMemOperand->setRandomized(true);
    return NewMemOperand;
  }
  case RPI_Pool: {
    // Pool the constant offset.
    // FROM:
    //  offset[base, index, shift]
    // TO:
    //  insert: mov $label, RegTemp
    //  insert: lea [base, RegTemp], RegTemp
    //  => [RegTemp, index, shift]

    // Memory operands should never occur as source operands in phi-lowering
    // assignments, so there is no need to reuse any registers here. In
    // general, we should not ask for new physical registers during phi
    // lowering. However, if we do meet a memory operand during phi lowering,
    // we do not blind or pool the immediate for now.
    if (RegNum.hasValue())
      return MemOperand;
    Variable *RegTemp = makeReg(IceType_i32);
    assert(MemOperand->getOffset()->getShouldBePooled());
    constexpr RelocOffsetT SymOffset = 0;
    Constant *Symbol =
        Ctx->getConstantSym(SymOffset, MemOperand->getOffset()->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *SymbolOperand = X86OperandMem::create(
        Func, MemOperand->getOffset()->getType(), NoBase, Symbol);
    _mov(RegTemp, SymbolOperand);
    // If there is a base variable, we add an lea instruction to add its value
    // to RegTemp; without a base variable, the lea is unnecessary.
    if (MemOperand->getBase()) {
      X86OperandMem *CalculateOperand = X86OperandMem::create(
          Func, MemOperand->getType(), MemOperand->getBase(), nullptr, RegTemp,
          0, MemOperand->getSegmentRegister());
      _lea(RegTemp, CalculateOperand);
    }
    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, nullptr, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister());
    return NewMemOperand;
  }
  }
}

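// As a sketch of the emitted text, a two-entry jump table (all names here are
// hypothetical placeholders) comes out as:
//   .section .rodata.<fn>$jumptable,"a",@progbits
//   .align 4
//   <table label>:
//   .long <target 0 label>
//   .long <target 1 label>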
template <typename TraitsType>
void TargetX86Base<TraitsType>::emitJumpTable(
    const Cfg *, const InstJumpTable *JumpTable) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
  Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
      << ",\"a\",@progbits\n"
         "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
      << JumpTable->getName() << ":";

  // On x86 with ILP32, pointers are 32-bit, hence the use of .long.
  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
  Str << "\n";
}

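// As a sketch of the emitted text for one pooled f32 constant (the label name
// is a hypothetical placeholder, and we assume the f32 pool's AsmTag is
// ".long" and its PrintfString renders hex):
//   .section .rodata.cst4,"aM",@progbits,4
//   .align 4
//   <label>:
//   .long 0x3f800000 /* f32 1.000000e+00 */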
template <typename TraitsType>
template <typename T>
void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Type Ty = T::Ty;
  SizeT Align = typeAlignInBytes(Ty);
  ConstantList Pool = Ctx->getConstantPool(Ty);

  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
      << "\n";
  Str << "\t.align\t" << Align << "\n";

  // If the reorder-pooled-constants option is set, shuffle the constant pool
  // before emitting it.
  if (getFlags().getReorderPooledConstants() && !Pool.empty()) {
    // Use the constant's kind value as the salt for creating the random
    // number generator.
    Operand::OperandKind K = (*Pool.begin())->getKind();
    RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                              RPE_PooledConstantReordering, K);
    RandomShuffle(Pool.begin(), Pool.end(),
                  [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
  }

  for (Constant *C : Pool) {
    if (!C->getShouldBePooled())
      continue;
    auto *Const = llvm::cast<typename T::IceType>(C);
    typename T::IceType::PrimType Value = Const->getValue();
    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
    // breaking strict-aliasing rules.
    typename T::PrimitiveIntType RawValue;
    memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // avoid warnings if asserts are disabled
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerJumpTables() {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from Cfg.
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
    for (const JumpTableData &JT : Ctx->getJumpTables()) {
      Str << "\t.section\t" << Prefix << JT.getSectionName()
          << ",\"a\",@progbits\n"
             "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
          << JT.getName().toString() << ":";

      // On x86-64 with ILP32, pointers are 32-bit, hence the use of .long.
      for (intptr_t TargetOffset : JT.getTargetOffsets())
        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
      Str << "\n";
    }
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerGlobals(
    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}
} // end of namespace X86NAMESPACE
} // end of namespace Ice

#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H