Home | History | Annotate | Download | only in src
      1 //===- subzero/src/IceTargetLoweringARM32.cpp - ARM32 lowering ------------===//
      2 //
      3 //                        The Subzero Code Generator
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Implements the TargetLoweringARM32 class, which consists almost
     12 /// entirely of the lowering sequence for each high-level instruction.
     13 ///
     14 //===----------------------------------------------------------------------===//
     15 #include "IceTargetLoweringARM32.h"
     16 
     17 #include "IceCfg.h"
     18 #include "IceCfgNode.h"
     19 #include "IceClFlags.h"
     20 #include "IceDefs.h"
     21 #include "IceELFObjectWriter.h"
     22 #include "IceGlobalInits.h"
     23 #include "IceInstARM32.def"
     24 #include "IceInstARM32.h"
     25 #include "IceInstVarIter.h"
     26 #include "IceLiveness.h"
     27 #include "IceOperand.h"
     28 #include "IcePhiLoweringImpl.h"
     29 #include "IceRegistersARM32.h"
     30 #include "IceTargetLoweringARM32.def"
     31 #include "IceUtils.h"
     32 #include "llvm/Support/MathExtras.h"
     33 
     34 #include <algorithm>
     35 #include <array>
     36 #include <utility>
     37 
     38 namespace ARM32 {
     39 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
     40   return ::Ice::ARM32::TargetARM32::create(Func);
     41 }
     42 
     43 std::unique_ptr<::Ice::TargetDataLowering>
     44 createTargetDataLowering(::Ice::GlobalContext *Ctx) {
     45   return ::Ice::ARM32::TargetDataARM32::create(Ctx);
     46 }
     47 
     48 std::unique_ptr<::Ice::TargetHeaderLowering>
     49 createTargetHeaderLowering(::Ice::GlobalContext *Ctx) {
     50   return ::Ice::ARM32::TargetHeaderARM32::create(Ctx);
     51 }
     52 
     53 void staticInit(::Ice::GlobalContext *Ctx) {
     54   ::Ice::ARM32::TargetARM32::staticInit(Ctx);
     55   if (Ice::getFlags().getUseNonsfi()) {
     56     // In nonsfi, we need to reference the _GLOBAL_OFFSET_TABLE_ for accessing
     57     // globals. The GOT is an external symbol (i.e., it is not defined in the
     58     // pexe) so we need to register it as such so that ELF emission won't barf
     59     // on an "unknown" symbol. The GOT is added to the External symbols list
     60     // here because staticInit() is invoked in a single-thread context.
     61     Ctx->getConstantExternSym(Ctx->getGlobalString(::Ice::GlobalOffsetTable));
     62   }
     63 }
     64 
     65 bool shouldBePooled(const ::Ice::Constant *C) {
     66   return ::Ice::ARM32::TargetARM32::shouldBePooled(C);
     67 }
     68 
     69 ::Ice::Type getPointerType() {
     70   return ::Ice::ARM32::TargetARM32::getPointerType();
     71 }
     72 
     73 } // end of namespace ARM32
     74 
     75 namespace Ice {
     76 namespace ARM32 {
     77 
     78 namespace {
     79 
     80 /// SizeOf is used to obtain the size of an initializer list as a constexpr
     81 /// expression. This is only needed until our C++ library is updated to
     82 /// C++ 14 -- which defines constexpr members to std::initializer_list.
     83 class SizeOf {
     84   SizeOf(const SizeOf &) = delete;
     85   SizeOf &operator=(const SizeOf &) = delete;
     86 
     87 public:
     88   constexpr SizeOf() : Size(0) {}
     89   template <typename... T>
     90   explicit constexpr SizeOf(T...)
     91       : Size(__length<T...>::value) {}
     92   constexpr SizeT size() const { return Size; }
     93 
     94 private:
     95   template <typename T, typename... U> struct __length {
     96     static constexpr std::size_t value = 1 + __length<U...>::value;
     97   };
     98 
     99   template <typename T> struct __length<T> {
    100     static constexpr std::size_t value = 1;
    101   };
    102 
    103   const std::size_t Size;
    104 };
    105 
    106 } // end of anonymous namespace
    107 
    108 // Defines the RegARM32::Table table with register information.
    109 RegARM32::RegTableType RegARM32::RegTable[RegARM32::Reg_NUM] = {
    110 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
    111           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
    112   {                                                                            \
    113     name, encode, cc_arg, scratch, preserved, stackptr, frameptr, isGPR,       \
    114         isInt, isI64Pair, isFP32, isFP64, isVec128,                            \
    115         (SizeOf alias_init).size(), alias_init                                 \
    116   }                                                                            \
    117   ,
    118     REGARM32_TABLE
    119 #undef X
    120 };
    121 
    122 namespace {
    123 
    124 // The following table summarizes the logic for lowering the icmp instruction
    125 // for i32 and narrower types. Each icmp condition has a clear mapping to an
    126 // ARM32 conditional move instruction.
    127 
    128 const struct TableIcmp32_ {
    129   CondARM32::Cond Mapping;
    130 } TableIcmp32[] = {
    131 #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
    132   { CondARM32::C_32 }                                                          \
    133   ,
    134     ICMPARM32_TABLE
    135 #undef X
    136 };
    137 
    138 // The following table summarizes the logic for lowering the icmp instruction
    139 // for the i64 type. Two conditional moves are needed for setting to 1 or 0.
    140 // The operands may need to be swapped, and there is a slight difference for
    141 // signed vs unsigned (comparing hi vs lo first, and using cmp vs sbc).
    142 const struct TableIcmp64_ {
    143   bool IsSigned;
    144   bool Swapped;
    145   CondARM32::Cond C1, C2;
    146 } TableIcmp64[] = {
    147 #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
    148   { is_signed, swapped64, CondARM32::C1_64, CondARM32::C2_64 }                 \
    149   ,
    150     ICMPARM32_TABLE
    151 #undef X
    152 };
    153 
    154 CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) {
    155   assert(Cond < llvm::array_lengthof(TableIcmp32));
    156   return TableIcmp32[Cond].Mapping;
    157 }
    158 
    159 // In some cases, there are x-macros tables for both high-level and low-level
    160 // instructions/operands that use the same enum key value. The tables are kept
    161 // separate to maintain a proper separation between abstraction layers. There
    162 // is a risk that the tables could get out of sync if enum values are reordered
    163 // or if entries are added or deleted. The following anonymous namespaces use
    164 // static_asserts to ensure everything is kept in sync.
    165 
    166 // Validate the enum values in ICMPARM32_TABLE.
    167 namespace {
    168 // Define a temporary set of enum values based on low-level table entries.
    169 enum _icmp_ll_enum {
    170 #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
    171   _icmp_ll_##val,
    172   ICMPARM32_TABLE
    173 #undef X
    174       _num
    175 };
    176 // Define a set of constants based on high-level table entries.
    177 #define X(tag, reverse, str)                                                   \
    178   static constexpr int _icmp_hl_##tag = InstIcmp::tag;
    179 ICEINSTICMP_TABLE
    180 #undef X
    181 // Define a set of constants based on low-level table entries, and ensure the
    182 // table entry keys are consistent.
    183 #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
    184   static_assert(                                                               \
    185       _icmp_ll_##val == _icmp_hl_##val,                                        \
    186       "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #val);
    187 ICMPARM32_TABLE
    188 #undef X
    189 // Repeat the static asserts with respect to the high-level table entries in
    190 // case the high-level table has extra entries.
    191 #define X(tag, reverse, str)                                                   \
    192   static_assert(                                                               \
    193       _icmp_hl_##tag == _icmp_ll_##tag,                                        \
    194       "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #tag);
    195 ICEINSTICMP_TABLE
    196 #undef X
    197 } // end of anonymous namespace
    198 
    199 // Stack alignment
    200 const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
    201 
    202 // Value is in bytes. Return Value adjusted to the next highest multiple of the
    203 // stack alignment.
    204 uint32_t applyStackAlignment(uint32_t Value) {
    205   return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
    206 }
    207 
    208 // Value is in bytes. Return Value adjusted to the next highest multiple of the
    209 // stack alignment required for the given type.
    210 uint32_t applyStackAlignmentTy(uint32_t Value, Type Ty) {
    211   // Use natural alignment, except that normally (non-NaCl) ARM only aligns
    212   // vectors to 8 bytes.
    213   // TODO(jvoung): Check this ...
    214   size_t typeAlignInBytes = typeWidthInBytes(Ty);
    215   if (isVectorType(Ty))
    216     typeAlignInBytes = 8;
    217   return Utils::applyAlignment(Value, typeAlignInBytes);
    218 }
    219 
    220 // Conservatively check if at compile time we know that the operand is
    221 // definitely a non-zero integer.
    222 bool isGuaranteedNonzeroInt(const Operand *Op) {
    223   if (auto *Const = llvm::dyn_cast_or_null<ConstantInteger32>(Op)) {
    224     return Const->getValue() != 0;
    225   }
    226   return false;
    227 }
    228 
    229 } // end of anonymous namespace
    230 
    231 TargetARM32Features::TargetARM32Features(const ClFlags &Flags) {
    232   static_assert(
    233       (ARM32InstructionSet::End - ARM32InstructionSet::Begin) ==
    234           (TargetInstructionSet::ARM32InstructionSet_End -
    235            TargetInstructionSet::ARM32InstructionSet_Begin),
    236       "ARM32InstructionSet range different from TargetInstructionSet");
    237   if (Flags.getTargetInstructionSet() !=
    238       TargetInstructionSet::BaseInstructionSet) {
    239     InstructionSet = static_cast<ARM32InstructionSet>(
    240         (Flags.getTargetInstructionSet() -
    241          TargetInstructionSet::ARM32InstructionSet_Begin) +
    242         ARM32InstructionSet::Begin);
    243   }
    244 }
    245 
    246 namespace {
    247 constexpr SizeT NumGPRArgs =
    248 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
    249           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
    250   +(((cc_arg) > 0) ? 1 : 0)
    251     REGARM32_GPR_TABLE
    252 #undef X
    253     ;
    254 std::array<RegNumT, NumGPRArgs> GPRArgInitializer;
    255 
    256 constexpr SizeT NumI64Args =
    257 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
    258           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
    259   +(((cc_arg) > 0) ? 1 : 0)
    260     REGARM32_I64PAIR_TABLE
    261 #undef X
    262     ;
    263 std::array<RegNumT, NumI64Args> I64ArgInitializer;
    264 
    265 constexpr SizeT NumFP32Args =
    266 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
    267           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
    268   +(((cc_arg) > 0) ? 1 : 0)
    269     REGARM32_FP32_TABLE
    270 #undef X
    271     ;
    272 std::array<RegNumT, NumFP32Args> FP32ArgInitializer;
    273 
    274 constexpr SizeT NumFP64Args =
    275 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
    276           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
    277   +(((cc_arg) > 0) ? 1 : 0)
    278     REGARM32_FP64_TABLE
    279 #undef X
    280     ;
    281 std::array<RegNumT, NumFP64Args> FP64ArgInitializer;
    282 
    283 constexpr SizeT NumVec128Args =
    284 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
    285           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
    286   +(((cc_arg > 0)) ? 1 : 0)
    287     REGARM32_VEC128_TABLE
    288 #undef X
    289     ;
    290 std::array<RegNumT, NumVec128Args> Vec128ArgInitializer;
    291 
    292 const char *getRegClassName(RegClass C) {
    293   auto ClassNum = static_cast<RegARM32::RegClassARM32>(C);
    294   assert(ClassNum < RegARM32::RCARM32_NUM);
    295   switch (ClassNum) {
    296   default:
    297     assert(C < RC_Target);
    298     return regClassString(C);
    299   // Add handling of new register classes below.
    300   case RegARM32::RCARM32_QtoS:
    301     return "QtoS";
    302   }
    303 }
    304 
    305 } // end of anonymous namespace
    306 
    307 TargetARM32::TargetARM32(Cfg *Func)
    308     : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl),
    309       CPUFeatures(getFlags()) {}
    310 
    311 void TargetARM32::staticInit(GlobalContext *Ctx) {
    312   RegNumT::setLimit(RegARM32::Reg_NUM);
    313   // Limit this size (or do all bitsets need to be the same width)???
    314   SmallBitVector IntegerRegisters(RegARM32::Reg_NUM);
    315   SmallBitVector I64PairRegisters(RegARM32::Reg_NUM);
    316   SmallBitVector Float32Registers(RegARM32::Reg_NUM);
    317   SmallBitVector Float64Registers(RegARM32::Reg_NUM);
    318   SmallBitVector VectorRegisters(RegARM32::Reg_NUM);
    319   SmallBitVector QtoSRegisters(RegARM32::Reg_NUM);
    320   SmallBitVector InvalidRegisters(RegARM32::Reg_NUM);
    321   const unsigned EncodedReg_q8 = RegARM32::RegTable[RegARM32::Reg_q8].Encoding;
    322   for (int i = 0; i < RegARM32::Reg_NUM; ++i) {
    323     const auto &Entry = RegARM32::RegTable[i];
    324     IntegerRegisters[i] = Entry.IsInt;
    325     I64PairRegisters[i] = Entry.IsI64Pair;
    326     Float32Registers[i] = Entry.IsFP32;
    327     Float64Registers[i] = Entry.IsFP64;
    328     VectorRegisters[i] = Entry.IsVec128;
    329     RegisterAliases[i].resize(RegARM32::Reg_NUM);
    330     // TODO(eholk): It would be better to store a QtoS flag in the
    331     // IceRegistersARM32 table than to compare their encodings here.
    332     QtoSRegisters[i] = Entry.IsVec128 && Entry.Encoding < EncodedReg_q8;
    333     for (int j = 0; j < Entry.NumAliases; ++j) {
    334       assert(i == j || !RegisterAliases[i][Entry.Aliases[j]]);
    335       RegisterAliases[i].set(Entry.Aliases[j]);
    336     }
    337     assert(RegisterAliases[i][i]);
    338     if (Entry.CCArg <= 0) {
    339       continue;
    340     }
    341     const auto RegNum = RegNumT::fromInt(i);
    342     if (Entry.IsGPR) {
    343       GPRArgInitializer[Entry.CCArg - 1] = RegNum;
    344     } else if (Entry.IsI64Pair) {
    345       I64ArgInitializer[Entry.CCArg - 1] = RegNum;
    346     } else if (Entry.IsFP32) {
    347       FP32ArgInitializer[Entry.CCArg - 1] = RegNum;
    348     } else if (Entry.IsFP64) {
    349       FP64ArgInitializer[Entry.CCArg - 1] = RegNum;
    350     } else if (Entry.IsVec128) {
    351       Vec128ArgInitializer[Entry.CCArg - 1] = RegNum;
    352     }
    353   }
    354   TypeToRegisterSet[IceType_void] = InvalidRegisters;
    355   TypeToRegisterSet[IceType_i1] = IntegerRegisters;
    356   TypeToRegisterSet[IceType_i8] = IntegerRegisters;
    357   TypeToRegisterSet[IceType_i16] = IntegerRegisters;
    358   TypeToRegisterSet[IceType_i32] = IntegerRegisters;
    359   TypeToRegisterSet[IceType_i64] = I64PairRegisters;
    360   TypeToRegisterSet[IceType_f32] = Float32Registers;
    361   TypeToRegisterSet[IceType_f64] = Float64Registers;
    362   TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
    363   TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
    364   TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
    365   TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
    366   TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
    367   TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
    368   TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
    369   TypeToRegisterSet[RegARM32::RCARM32_QtoS] = QtoSRegisters;
    370 
    371   for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
    372     TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
    373 
    374   filterTypeToRegisterSet(Ctx, RegARM32::Reg_NUM, TypeToRegisterSet,
    375                           llvm::array_lengthof(TypeToRegisterSet),
    376                           [](RegNumT RegNum) -> std::string {
    377                             // This function simply removes ", " from the
    378                             // register name.
    379                             std::string Name = RegARM32::getRegName(RegNum);
    380                             constexpr const char RegSeparator[] = ", ";
    381                             constexpr size_t RegSeparatorWidth =
    382                                 llvm::array_lengthof(RegSeparator) - 1;
    383                             for (size_t Pos = Name.find(RegSeparator);
    384                                  Pos != std::string::npos;
    385                                  Pos = Name.find(RegSeparator)) {
    386                               Name.replace(Pos, RegSeparatorWidth, "");
    387                             }
    388                             return Name;
    389                           },
    390                           getRegClassName);
    391 }
    392 
    393 namespace {
    394 void copyRegAllocFromInfWeightVariable64On32(const VarList &Vars) {
    395   for (Variable *Var : Vars) {
    396     auto *Var64 = llvm::dyn_cast<Variable64On32>(Var);
    397     if (!Var64) {
    398       // This is not the variable we are looking for.
    399       continue;
    400     }
    401     // only allow infinite-weight i64 temporaries to be register allocated.
    402     assert(!Var64->hasReg() || Var64->mustHaveReg());
    403     if (!Var64->hasReg()) {
    404       continue;
    405     }
    406     const auto FirstReg =
    407         RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(Var->getRegNum()));
    408     // This assumes little endian.
    409     Variable *Lo = Var64->getLo();
    410     Variable *Hi = Var64->getHi();
    411     assert(Lo->hasReg() == Hi->hasReg());
    412     if (Lo->hasReg()) {
    413       continue;
    414     }
    415     Lo->setRegNum(FirstReg);
    416     Lo->setMustHaveReg();
    417     Hi->setRegNum(RegNumT::fixme(FirstReg + 1));
    418     Hi->setMustHaveReg();
    419   }
    420 }
    421 } // end of anonymous namespace
    422 
    423 uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) {
    424   TargetARM32::CallingConv CC;
    425   RegNumT DummyReg;
    426   size_t OutArgsSizeBytes = 0;
    427   for (SizeT i = 0, NumArgs = Call->getNumArgs(); i < NumArgs; ++i) {
    428     Operand *Arg = legalizeUndef(Call->getArg(i));
    429     const Type Ty = Arg->getType();
    430     if (isScalarIntegerType(Ty)) {
    431       if (CC.argInGPR(Ty, &DummyReg)) {
    432         continue;
    433       }
    434     } else {
    435       if (CC.argInVFP(Ty, &DummyReg)) {
    436         continue;
    437       }
    438     }
    439 
    440     OutArgsSizeBytes = applyStackAlignmentTy(OutArgsSizeBytes, Ty);
    441     OutArgsSizeBytes += typeWidthInBytesOnStack(Ty);
    442   }
    443 
    444   return applyStackAlignment(OutArgsSizeBytes);
    445 }
    446 
    447 void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
    448   constexpr bool NoTailCall = false;
    449   constexpr bool IsTargetHelperCall = true;
    450 
    451   switch (Instr->getKind()) {
    452   default:
    453     return;
    454   case Inst::Arithmetic: {
    455     Variable *Dest = Instr->getDest();
    456     const Type DestTy = Dest->getType();
    457     const InstArithmetic::OpKind Op =
    458         llvm::cast<InstArithmetic>(Instr)->getOp();
    459     if (isVectorType(DestTy)) {
    460       switch (Op) {
    461       default:
    462         break;
    463       case InstArithmetic::Fdiv:
    464       case InstArithmetic::Frem:
    465       case InstArithmetic::Sdiv:
    466       case InstArithmetic::Srem:
    467       case InstArithmetic::Udiv:
    468       case InstArithmetic::Urem:
    469         scalarizeArithmetic(Op, Dest, Instr->getSrc(0), Instr->getSrc(1));
    470         Instr->setDeleted();
    471         return;
    472       }
    473     }
    474     switch (DestTy) {
    475     default:
    476       return;
    477     case IceType_i64: {
    478       // Technically, ARM has its own aeabi routines, but we can use the
    479       // non-aeabi routine as well. LLVM uses __aeabi_ldivmod for div, but uses
    480       // the more standard __moddi3 for rem.
    481       RuntimeHelper HelperID = RuntimeHelper::H_Num;
    482       switch (Op) {
    483       default:
    484         return;
    485       case InstArithmetic::Udiv:
    486         HelperID = RuntimeHelper::H_udiv_i64;
    487         break;
    488       case InstArithmetic::Sdiv:
    489         HelperID = RuntimeHelper::H_sdiv_i64;
    490         break;
    491       case InstArithmetic::Urem:
    492         HelperID = RuntimeHelper::H_urem_i64;
    493         break;
    494       case InstArithmetic::Srem:
    495         HelperID = RuntimeHelper::H_srem_i64;
    496         break;
    497       }
    498       Operand *TargetHelper = Ctx->getRuntimeHelperFunc(HelperID);
    499       ARM32HelpersPreamble[TargetHelper] = &TargetARM32::preambleDivRem;
    500       constexpr SizeT MaxArgs = 2;
    501       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    502                                             NoTailCall, IsTargetHelperCall);
    503       Call->addArg(Instr->getSrc(0));
    504       Call->addArg(Instr->getSrc(1));
    505       Instr->setDeleted();
    506       return;
    507     }
    508     case IceType_i32:
    509     case IceType_i16:
    510     case IceType_i8: {
    511       const bool HasHWDiv = hasCPUFeature(TargetARM32Features::HWDivArm);
    512       InstCast::OpKind CastKind;
    513       RuntimeHelper HelperID = RuntimeHelper::H_Num;
    514       switch (Op) {
    515       default:
    516         return;
    517       case InstArithmetic::Udiv:
    518         HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_udiv_i32;
    519         CastKind = InstCast::Zext;
    520         break;
    521       case InstArithmetic::Sdiv:
    522         HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_sdiv_i32;
    523         CastKind = InstCast::Sext;
    524         break;
    525       case InstArithmetic::Urem:
    526         HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_urem_i32;
    527         CastKind = InstCast::Zext;
    528         break;
    529       case InstArithmetic::Srem:
    530         HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_srem_i32;
    531         CastKind = InstCast::Sext;
    532         break;
    533       }
    534       if (HelperID == RuntimeHelper::H_Num) {
    535         // HelperID should only ever be undefined when the processor does not
    536         // have a hardware divider. If any other helpers are ever introduced,
    537         // the following assert will have to be modified.
    538         assert(HasHWDiv);
    539         return;
    540       }
    541       Operand *Src0 = Instr->getSrc(0);
    542       Operand *Src1 = Instr->getSrc(1);
    543       if (DestTy != IceType_i32) {
    544         // Src0 and Src1 have to be zero-, or signed-extended to i32. For Src0,
    545         // we just insert a InstCast right before the call to the helper.
    546         Variable *Src0_32 = Func->makeVariable(IceType_i32);
    547         Context.insert<InstCast>(CastKind, Src0_32, Src0);
    548         Src0 = Src0_32;
    549 
    550         // For extending Src1, we will just insert an InstCast if Src1 is not a
    551         // Constant. If it is, then we extend it here, and not during program
    552         // runtime. This allows preambleDivRem to optimize-out the div-by-0
    553         // check.
    554         if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
    555           const int32_t ShAmt = (DestTy == IceType_i16) ? 16 : 24;
    556           int32_t NewC = C->getValue();
    557           if (CastKind == InstCast::Zext) {
    558             NewC &= ~(0x80000000l >> ShAmt);
    559           } else {
    560             NewC = (NewC << ShAmt) >> ShAmt;
    561           }
    562           Src1 = Ctx->getConstantInt32(NewC);
    563         } else {
    564           Variable *Src1_32 = Func->makeVariable(IceType_i32);
    565           Context.insert<InstCast>(CastKind, Src1_32, Src1);
    566           Src1 = Src1_32;
    567         }
    568       }
    569       Operand *TargetHelper = Ctx->getRuntimeHelperFunc(HelperID);
    570       ARM32HelpersPreamble[TargetHelper] = &TargetARM32::preambleDivRem;
    571       constexpr SizeT MaxArgs = 2;
    572       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    573                                             NoTailCall, IsTargetHelperCall);
    574       assert(Src0->getType() == IceType_i32);
    575       Call->addArg(Src0);
    576       assert(Src1->getType() == IceType_i32);
    577       Call->addArg(Src1);
    578       Instr->setDeleted();
    579       return;
    580     }
    581     case IceType_f64:
    582     case IceType_f32: {
    583       if (Op != InstArithmetic::Frem) {
    584         return;
    585       }
    586       constexpr SizeT MaxArgs = 2;
    587       Operand *TargetHelper = Ctx->getRuntimeHelperFunc(
    588           DestTy == IceType_f32 ? RuntimeHelper::H_frem_f32
    589                                 : RuntimeHelper::H_frem_f64);
    590       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    591                                             NoTailCall, IsTargetHelperCall);
    592       Call->addArg(Instr->getSrc(0));
    593       Call->addArg(Instr->getSrc(1));
    594       Instr->setDeleted();
    595       return;
    596     }
    597     }
    598     llvm::report_fatal_error("Control flow should never have reached here.");
    599   }
    600   case Inst::Cast: {
    601     Variable *Dest = Instr->getDest();
    602     Operand *Src0 = Instr->getSrc(0);
    603     const Type DestTy = Dest->getType();
    604     const Type SrcTy = Src0->getType();
    605     auto *CastInstr = llvm::cast<InstCast>(Instr);
    606     const InstCast::OpKind CastKind = CastInstr->getCastKind();
    607 
    608     switch (CastKind) {
    609     default:
    610       return;
    611     case InstCast::Fptosi:
    612     case InstCast::Fptoui: {
    613       if (DestTy != IceType_i64) {
    614         return;
    615       }
    616       const bool DestIsSigned = CastKind == InstCast::Fptosi;
    617       const bool Src0IsF32 = isFloat32Asserting32Or64(SrcTy);
    618       Operand *TargetHelper = Ctx->getRuntimeHelperFunc(
    619           Src0IsF32 ? (DestIsSigned ? RuntimeHelper::H_fptosi_f32_i64
    620                                     : RuntimeHelper::H_fptoui_f32_i64)
    621                     : (DestIsSigned ? RuntimeHelper::H_fptosi_f64_i64
    622                                     : RuntimeHelper::H_fptoui_f64_i64));
    623       static constexpr SizeT MaxArgs = 1;
    624       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    625                                             NoTailCall, IsTargetHelperCall);
    626       Call->addArg(Src0);
    627       Instr->setDeleted();
    628       return;
    629     }
    630     case InstCast::Sitofp:
    631     case InstCast::Uitofp: {
    632       if (SrcTy != IceType_i64) {
    633         return;
    634       }
    635       const bool SourceIsSigned = CastKind == InstCast::Sitofp;
    636       const bool DestIsF32 = isFloat32Asserting32Or64(Dest->getType());
    637       Operand *TargetHelper = Ctx->getRuntimeHelperFunc(
    638           DestIsF32 ? (SourceIsSigned ? RuntimeHelper::H_sitofp_i64_f32
    639                                       : RuntimeHelper::H_uitofp_i64_f32)
    640                     : (SourceIsSigned ? RuntimeHelper::H_sitofp_i64_f64
    641                                       : RuntimeHelper::H_uitofp_i64_f64));
    642       static constexpr SizeT MaxArgs = 1;
    643       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    644                                             NoTailCall, IsTargetHelperCall);
    645       Call->addArg(Src0);
    646       Instr->setDeleted();
    647       return;
    648     }
    649     case InstCast::Bitcast: {
    650       if (DestTy == SrcTy) {
    651         return;
    652       }
    653       Variable *CallDest = Dest;
    654       RuntimeHelper HelperID = RuntimeHelper::H_Num;
    655       switch (DestTy) {
    656       default:
    657         return;
    658       case IceType_i8:
    659         assert(SrcTy == IceType_v8i1);
    660         HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
    661         CallDest = Func->makeVariable(IceType_i32);
    662         break;
    663       case IceType_i16:
    664         assert(SrcTy == IceType_v16i1);
    665         HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
    666         CallDest = Func->makeVariable(IceType_i32);
    667         break;
    668       case IceType_v8i1: {
    669         assert(SrcTy == IceType_i8);
    670         HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
    671         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
    672         // Arguments to functions are required to be at least 32 bits wide.
    673         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
    674         Src0 = Src0AsI32;
    675       } break;
    676       case IceType_v16i1: {
    677         assert(SrcTy == IceType_i16);
    678         HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
    679         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
    680         // Arguments to functions are required to be at least 32 bits wide.
    681         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
    682         Src0 = Src0AsI32;
    683       } break;
    684       }
    685       constexpr SizeT MaxSrcs = 1;
    686       InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
    687       Call->addArg(Src0);
    688       Context.insert(Call);
    689       // The PNaCl ABI disallows i8/i16 return types, so truncate the helper
    690       // call result to the appropriate type as necessary.
    691       if (CallDest->getType() != Dest->getType())
    692         Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
    693       Instr->setDeleted();
    694       return;
    695     }
    696     case InstCast::Trunc: {
    697       if (DestTy == SrcTy) {
    698         return;
    699       }
    700       if (!isVectorType(SrcTy)) {
    701         return;
    702       }
    703       assert(typeNumElements(DestTy) == typeNumElements(SrcTy));
    704       assert(typeElementType(DestTy) == IceType_i1);
    705       assert(isVectorIntegerType(SrcTy));
    706       return;
    707     }
    708     case InstCast::Sext:
    709     case InstCast::Zext: {
    710       if (DestTy == SrcTy) {
    711         return;
    712       }
    713       if (!isVectorType(DestTy)) {
    714         return;
    715       }
    716       assert(typeNumElements(DestTy) == typeNumElements(SrcTy));
    717       assert(typeElementType(SrcTy) == IceType_i1);
    718       assert(isVectorIntegerType(DestTy));
    719       return;
    720     }
    721     }
    722     llvm::report_fatal_error("Control flow should never have reached here.");
    723   }
    724   case Inst::IntrinsicCall: {
    725     Variable *Dest = Instr->getDest();
    726     auto *IntrinsicCall = llvm::cast<InstIntrinsicCall>(Instr);
    727     Intrinsics::IntrinsicID ID = IntrinsicCall->getIntrinsicInfo().ID;
    728     switch (ID) {
    729     default:
    730       return;
    731     case Intrinsics::Ctpop: {
    732       Operand *Src0 = IntrinsicCall->getArg(0);
    733       Operand *TargetHelper =
    734           Ctx->getRuntimeHelperFunc(isInt32Asserting32Or64(Src0->getType())
    735                                         ? RuntimeHelper::H_call_ctpop_i32
    736                                         : RuntimeHelper::H_call_ctpop_i64);
    737       static constexpr SizeT MaxArgs = 1;
    738       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    739                                             NoTailCall, IsTargetHelperCall);
    740       Call->addArg(Src0);
    741       Instr->setDeleted();
    742       if (Src0->getType() == IceType_i64) {
    743         ARM32HelpersPostamble[TargetHelper] = &TargetARM32::postambleCtpop64;
    744       }
    745       return;
    746     }
    747     case Intrinsics::Longjmp: {
    748       static constexpr SizeT MaxArgs = 2;
    749       static constexpr Variable *NoDest = nullptr;
    750       Operand *TargetHelper =
    751           Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_longjmp);
    752       auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
    753                                             NoTailCall, IsTargetHelperCall);
    754       Call->addArg(IntrinsicCall->getArg(0));
    755       Call->addArg(IntrinsicCall->getArg(1));
    756       Instr->setDeleted();
    757       return;
    758     }
    759     case Intrinsics::Memcpy: {
    760       // In the future, we could potentially emit an inline memcpy/memset, etc.
    761       // for intrinsic calls w/ a known length.
    762       static constexpr SizeT MaxArgs = 3;
    763       static constexpr Variable *NoDest = nullptr;
    764       Operand *TargetHelper =
    765           Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memcpy);
    766       auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
    767                                             NoTailCall, IsTargetHelperCall);
    768       Call->addArg(IntrinsicCall->getArg(0));
    769       Call->addArg(IntrinsicCall->getArg(1));
    770       Call->addArg(IntrinsicCall->getArg(2));
    771       Instr->setDeleted();
    772       return;
    773     }
    774     case Intrinsics::Memmove: {
    775       static constexpr SizeT MaxArgs = 3;
    776       static constexpr Variable *NoDest = nullptr;
    777       Operand *TargetHelper =
    778           Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memmove);
    779       auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
    780                                             NoTailCall, IsTargetHelperCall);
    781       Call->addArg(IntrinsicCall->getArg(0));
    782       Call->addArg(IntrinsicCall->getArg(1));
    783       Call->addArg(IntrinsicCall->getArg(2));
    784       Instr->setDeleted();
    785       return;
    786     }
    787     case Intrinsics::Memset: {
    788       // The value operand needs to be extended to a stack slot size because the
    789       // PNaCl ABI requires arguments to be at least 32 bits wide.
    790       Operand *ValOp = IntrinsicCall->getArg(1);
    791       assert(ValOp->getType() == IceType_i8);
    792       Variable *ValExt = Func->makeVariable(stackSlotType());
    793       Context.insert<InstCast>(InstCast::Zext, ValExt, ValOp);
    794 
    795       // Technically, ARM has its own __aeabi_memset, but we can use plain
    796       // memset too. The value and size argument need to be flipped if we ever
    797       // decide to use __aeabi_memset.
    798       static constexpr SizeT MaxArgs = 3;
    799       static constexpr Variable *NoDest = nullptr;
    800       Operand *TargetHelper =
    801           Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memset);
    802       auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
    803                                             NoTailCall, IsTargetHelperCall);
    804       Call->addArg(IntrinsicCall->getArg(0));
    805       Call->addArg(ValExt);
    806       Call->addArg(IntrinsicCall->getArg(2));
    807       Instr->setDeleted();
    808       return;
    809     }
    810     case Intrinsics::NaClReadTP: {
    811       if (SandboxingType == ST_NaCl) {
    812         return;
    813       }
    814       static constexpr SizeT MaxArgs = 0;
    815       Operand *TargetHelper =
    816           SandboxingType == ST_Nonsfi
    817               ? Ctx->getConstantExternSym(
    818                     Ctx->getGlobalString("__aeabi_read_tp"))
    819               : Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_read_tp);
    820       Context.insert<InstCall>(MaxArgs, Dest, TargetHelper, NoTailCall,
    821                                IsTargetHelperCall);
    822       Instr->setDeleted();
    823       return;
    824     }
    825     case Intrinsics::Setjmp: {
    826       static constexpr SizeT MaxArgs = 1;
    827       Operand *TargetHelper =
    828           Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_setjmp);
    829       auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
    830                                             NoTailCall, IsTargetHelperCall);
    831       Call->addArg(IntrinsicCall->getArg(0));
    832       Instr->setDeleted();
    833       return;
    834     }
    835     }
    836     llvm::report_fatal_error("Control flow should never have reached here.");
    837   }
    838   }
    839 }
    840 
    841 void TargetARM32::findMaxStackOutArgsSize() {
    842   // MinNeededOutArgsBytes should be updated if the Target ever creates a
    843   // high-level InstCall that requires more stack bytes.
    844   constexpr size_t MinNeededOutArgsBytes = 0;
    845   MaxOutArgsSizeBytes = MinNeededOutArgsBytes;
    846   for (CfgNode *Node : Func->getNodes()) {
    847     Context.init(Node);
    848     while (!Context.atEnd()) {
    849       PostIncrLoweringContext PostIncrement(Context);
    850       Inst *CurInstr = iteratorToInst(Context.getCur());
    851       if (auto *Call = llvm::dyn_cast<InstCall>(CurInstr)) {
    852         SizeT OutArgsSizeBytes = getCallStackArgumentsSizeBytes(Call);
    853         MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, OutArgsSizeBytes);
    854       }
    855     }
    856   }
    857 }
    858 
    859 void TargetARM32::createGotPtr() {
    860   if (SandboxingType != ST_Nonsfi) {
    861     return;
    862   }
    863   GotPtr = Func->makeVariable(IceType_i32);
    864 }
    865 
    866 void TargetARM32::insertGotPtrInitPlaceholder() {
    867   if (SandboxingType != ST_Nonsfi) {
    868     return;
    869   }
    870   assert(GotPtr != nullptr);
    871   // We add the two placeholder instructions here. The first fakedefs T, an
    872   // infinite-weight temporary, while the second fakedefs the GotPtr "using" T.
    873   // This is needed because the GotPtr initialization, if needed, will require
    874   // a register:
    875   //
    876   //   movw     reg, _GLOBAL_OFFSET_TABLE_ - 16 - .
    877   //   movt     reg, _GLOBAL_OFFSET_TABLE_ - 12 - .
    878   //   add      reg, pc, reg
    879   //   mov      GotPtr, reg
    880   //
    881   // If GotPtr is not used, then both these pseudo-instructions are dce'd.
    882   Variable *T = makeReg(IceType_i32);
    883   Context.insert<InstFakeDef>(T);
    884   Context.insert<InstFakeDef>(GotPtr, T);
    885 }
    886 
    887 GlobalString
    888 TargetARM32::createGotoffRelocation(const ConstantRelocatable *CR) {
    889   GlobalString CRName = CR->getName();
    890   GlobalString CRGotoffName =
    891       Ctx->getGlobalString("GOTOFF$" + Func->getFunctionName() + "$" + CRName);
    892   if (KnownGotoffs.count(CRGotoffName) == 0) {
    893     constexpr bool SuppressMangling = true;
    894     auto *Global =
    895         VariableDeclaration::create(Func->getGlobalPool(), SuppressMangling);
    896     Global->setIsConstant(true);
    897     Global->setName(CRName);
    898     Func->getGlobalPool()->willNotBeEmitted(Global);
    899 
    900     auto *Gotoff =
    901         VariableDeclaration::create(Func->getGlobalPool(), SuppressMangling);
    902     constexpr auto GotFixup = R_ARM_GOTOFF32;
    903     Gotoff->setIsConstant(true);
    904     Gotoff->addInitializer(VariableDeclaration::RelocInitializer::create(
    905         Func->getGlobalPool(), Global, {RelocOffset::create(Ctx, 0)},
    906         GotFixup));
    907     Gotoff->setName(CRGotoffName);
    908     Func->addGlobal(Gotoff);
    909     KnownGotoffs.emplace(CRGotoffName);
    910   }
    911   return CRGotoffName;
    912 }
    913 
    914 void TargetARM32::materializeGotAddr(CfgNode *Node) {
    915   if (SandboxingType != ST_Nonsfi) {
    916     return;
    917   }
    918 
    919   // At first, we try to find the
    920   //    GotPtr = def T
    921   // pseudo-instruction that we placed for defining the got ptr. That
    922   // instruction is not just a place-holder for defining the GotPtr (thus
    923   // keeping liveness consistent), but it is also located at a point where it is
    924   // safe to materialize the got addr -- i.e., before loading parameters to
    925   // registers, but after moving register parameters from their home location.
    926   InstFakeDef *DefGotPtr = nullptr;
    927   for (auto &Inst : Node->getInsts()) {
    928     auto *FakeDef = llvm::dyn_cast<InstFakeDef>(&Inst);
    929     if (FakeDef != nullptr && FakeDef->getDest() == GotPtr) {
    930       DefGotPtr = FakeDef;
    931       break;
    932     }
    933   }
    934 
    935   if (DefGotPtr == nullptr || DefGotPtr->isDeleted()) {
    936     return;
    937   }
    938 
    939   // The got addr needs to be materialized at the same point where DefGotPtr
    940   // lives.
    941   Context.setInsertPoint(instToIterator(DefGotPtr));
    942   assert(DefGotPtr->getSrcSize() == 1);
    943   auto *T = llvm::cast<Variable>(DefGotPtr->getSrc(0));
    944   loadNamedConstantRelocatablePIC(Ctx->getGlobalString(GlobalOffsetTable), T,
    945                                   [this, T](Variable *PC) { _add(T, PC, T); });
    946   _mov(GotPtr, T);
    947   DefGotPtr->setDeleted();
    948 }
    949 
    950 void TargetARM32::loadNamedConstantRelocatablePIC(
    951     GlobalString Name, Variable *Register,
    952     std::function<void(Variable *PC)> Finish) {
    953   assert(SandboxingType == ST_Nonsfi);
    954   // We makeReg() here instead of getPhysicalRegister() because the latter ends
    955   // up creating multi-blocks temporaries that liveness fails to validate.
    956   auto *PC = makeReg(IceType_i32, RegARM32::Reg_pc);
    957 
    958   auto *AddPcReloc = RelocOffset::create(Ctx);
    959   AddPcReloc->setSubtract(true);
    960   auto *AddPcLabel = InstARM32Label::create(Func, this);
    961   AddPcLabel->setRelocOffset(AddPcReloc);
    962 
    963   auto *MovwReloc = RelocOffset::create(Ctx);
    964   auto *MovwLabel = InstARM32Label::create(Func, this);
    965   MovwLabel->setRelocOffset(MovwReloc);
    966 
    967   auto *MovtReloc = RelocOffset::create(Ctx);
    968   auto *MovtLabel = InstARM32Label::create(Func, this);
    969   MovtLabel->setRelocOffset(MovtReloc);
    970 
    971   // The EmitString for these constant relocatables have hardcoded offsets
    972   // attached to them. This could be dangerous if, e.g., we ever implemented
    973   // instruction scheduling but llvm-mc currently does not support
    974   //
    975   //   movw reg, #:lower16:(Symbol - Label - Number)
    976   //   movt reg, #:upper16:(Symbol - Label - Number)
    977   //
    978   // relocations.
    979   static constexpr RelocOffsetT PcOffset = -8;
    980   auto *CRLower = Ctx->getConstantSymWithEmitString(
    981       PcOffset, {MovwReloc, AddPcReloc}, Name, Name + " -16");
    982   auto *CRUpper = Ctx->getConstantSymWithEmitString(
    983       PcOffset, {MovtReloc, AddPcReloc}, Name, Name + " -12");
    984 
    985   Context.insert(MovwLabel);
    986   _movw(Register, CRLower);
    987   Context.insert(MovtLabel);
    988   _movt(Register, CRUpper);
    989   // PC = fake-def to keep liveness consistent.
    990   Context.insert<InstFakeDef>(PC);
    991   Context.insert(AddPcLabel);
    992   Finish(PC);
    993 }
    994 
    995 void TargetARM32::translateO2() {
    996   TimerMarker T(TimerStack::TT_O2, Func);
    997 
    998   // TODO(stichnot): share passes with other targets?
    999   // https://code.google.com/p/nativeclient/issues/detail?id=4094
   1000   if (SandboxingType == ST_Nonsfi) {
   1001     createGotPtr();
   1002   }
   1003   genTargetHelperCalls();
   1004   findMaxStackOutArgsSize();
   1005 
   1006   // Do not merge Alloca instructions, and lay out the stack.
   1007   static constexpr bool SortAndCombineAllocas = true;
   1008   Func->processAllocas(SortAndCombineAllocas);
   1009   Func->dump("After Alloca processing");
   1010 
   1011   if (!getFlags().getEnablePhiEdgeSplit()) {
   1012     // Lower Phi instructions.
   1013     Func->placePhiLoads();
   1014     if (Func->hasError())
   1015       return;
   1016     Func->placePhiStores();
   1017     if (Func->hasError())
   1018       return;
   1019     Func->deletePhis();
   1020     if (Func->hasError())
   1021       return;
   1022     Func->dump("After Phi lowering");
   1023   }
   1024 
   1025   // Address mode optimization.
   1026   Func->getVMetadata()->init(VMK_SingleDefs);
   1027   Func->doAddressOpt();
   1028   Func->materializeVectorShuffles();
   1029 
   1030   // Argument lowering
   1031   Func->doArgLowering();
   1032 
   1033   // Target lowering. This requires liveness analysis for some parts of the
   1034   // lowering decisions, such as compare/branch fusing. If non-lightweight
   1035   // liveness analysis is used, the instructions need to be renumbered first.
   1036   // TODO: This renumbering should only be necessary if we're actually
   1037   // calculating live intervals, which we only do for register allocation.
   1038   Func->renumberInstructions();
   1039   if (Func->hasError())
   1040     return;
   1041 
   1042   // TODO: It should be sufficient to use the fastest liveness calculation,
   1043   // i.e. livenessLightweight(). However, for some reason that slows down the
   1044   // rest of the translation. Investigate.
   1045   Func->liveness(Liveness_Basic);
   1046   if (Func->hasError())
   1047     return;
   1048   Func->dump("After ARM32 address mode opt");
   1049 
   1050   if (SandboxingType == ST_Nonsfi) {
   1051     insertGotPtrInitPlaceholder();
   1052   }
   1053   Func->genCode();
   1054   if (Func->hasError())
   1055     return;
   1056   Func->dump("After ARM32 codegen");
   1057 
   1058   // Register allocation. This requires instruction renumbering and full
   1059   // liveness analysis.
   1060   Func->renumberInstructions();
   1061   if (Func->hasError())
   1062     return;
   1063   Func->liveness(Liveness_Intervals);
   1064   if (Func->hasError())
   1065     return;
   1066   // The post-codegen dump is done here, after liveness analysis and associated
   1067   // cleanup, to make the dump cleaner and more useful.
   1068   Func->dump("After initial ARM32 codegen");
   1069   // Validate the live range computations. The expensive validation call is
   1070   // deliberately only made when assertions are enabled.
   1071   assert(Func->validateLiveness());
   1072   Func->getVMetadata()->init(VMK_All);
   1073   regAlloc(RAK_Global);
   1074   if (Func->hasError())
   1075     return;
   1076 
   1077   copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
   1078   Func->dump("After linear scan regalloc");
   1079 
   1080   if (getFlags().getEnablePhiEdgeSplit()) {
   1081     Func->advancedPhiLowering();
   1082     Func->dump("After advanced Phi lowering");
   1083   }
   1084 
   1085   ForbidTemporaryWithoutReg _(this);
   1086 
   1087   // Stack frame mapping.
   1088   Func->genFrame();
   1089   if (Func->hasError())
   1090     return;
   1091   Func->dump("After stack frame mapping");
   1092 
   1093   postLowerLegalization();
   1094   if (Func->hasError())
   1095     return;
   1096   Func->dump("After postLowerLegalization");
   1097 
   1098   Func->contractEmptyNodes();
   1099   Func->reorderNodes();
   1100 
   1101   // Branch optimization. This needs to be done just before code emission. In
   1102   // particular, no transformations that insert or reorder CfgNodes should be
   1103   // done after branch optimization. We go ahead and do it before nop insertion
   1104   // to reduce the amount of work needed for searching for opportunities.
   1105   Func->doBranchOpt();
   1106   Func->dump("After branch optimization");
   1107 
   1108   // Nop insertion
   1109   if (getFlags().getShouldDoNopInsertion()) {
   1110     Func->doNopInsertion();
   1111   }
   1112 }
   1113 
   1114 void TargetARM32::translateOm1() {
   1115   TimerMarker T(TimerStack::TT_Om1, Func);
   1116 
   1117   // TODO(stichnot): share passes with other targets?
   1118   if (SandboxingType == ST_Nonsfi) {
   1119     createGotPtr();
   1120   }
   1121 
   1122   genTargetHelperCalls();
   1123   findMaxStackOutArgsSize();
   1124 
   1125   // Do not merge Alloca instructions, and lay out the stack.
   1126   static constexpr bool DontSortAndCombineAllocas = false;
   1127   Func->processAllocas(DontSortAndCombineAllocas);
   1128   Func->dump("After Alloca processing");
   1129 
   1130   Func->placePhiLoads();
   1131   if (Func->hasError())
   1132     return;
   1133   Func->placePhiStores();
   1134   if (Func->hasError())
   1135     return;
   1136   Func->deletePhis();
   1137   if (Func->hasError())
   1138     return;
   1139   Func->dump("After Phi lowering");
   1140 
   1141   Func->doArgLowering();
   1142 
   1143   if (SandboxingType == ST_Nonsfi) {
   1144     insertGotPtrInitPlaceholder();
   1145   }
   1146   Func->genCode();
   1147   if (Func->hasError())
   1148     return;
   1149   Func->dump("After initial ARM32 codegen");
   1150 
   1151   regAlloc(RAK_InfOnly);
   1152   if (Func->hasError())
   1153     return;
   1154 
   1155   copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
   1156   Func->dump("After regalloc of infinite-weight variables");
   1157 
   1158   ForbidTemporaryWithoutReg _(this);
   1159 
   1160   Func->genFrame();
   1161   if (Func->hasError())
   1162     return;
   1163   Func->dump("After stack frame mapping");
   1164 
   1165   postLowerLegalization();
   1166   if (Func->hasError())
   1167     return;
   1168   Func->dump("After postLowerLegalization");
   1169 
   1170   // Nop insertion
   1171   if (getFlags().getShouldDoNopInsertion()) {
   1172     Func->doNopInsertion();
   1173   }
   1174 }
   1175 
   1176 uint32_t TargetARM32::getStackAlignment() const {
   1177   return ARM32_STACK_ALIGNMENT_BYTES;
   1178 }
   1179 
   1180 bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) {
   1181   if (auto *Br = llvm::dyn_cast<InstARM32Br>(I)) {
   1182     return Br->optimizeBranch(NextNode);
   1183   }
   1184   return false;
   1185 }
   1186 
   1187 const char *TargetARM32::getRegName(RegNumT RegNum, Type Ty) const {
   1188   (void)Ty;
   1189   return RegARM32::getRegName(RegNum);
   1190 }
   1191 
   1192 Variable *TargetARM32::getPhysicalRegister(RegNumT RegNum, Type Ty) {
   1193   static const Type DefaultType[] = {
   1194 #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr,   \
   1195           isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)       \
   1196   (isFP32)                                                                     \
   1197       ? IceType_f32                                                            \
   1198       : ((isFP64) ? IceType_f64 : ((isVec128 ? IceType_v4i32 : IceType_i32))),
   1199       REGARM32_TABLE
   1200 #undef X
   1201   };
   1202 
   1203   if (Ty == IceType_void) {
   1204     assert(unsigned(RegNum) < llvm::array_lengthof(DefaultType));
   1205     Ty = DefaultType[RegNum];
   1206   }
   1207   if (PhysicalRegisters[Ty].empty())
   1208     PhysicalRegisters[Ty].resize(RegARM32::Reg_NUM);
   1209   assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
   1210   Variable *Reg = PhysicalRegisters[Ty][RegNum];
   1211   if (Reg == nullptr) {
   1212     Reg = Func->makeVariable(Ty);
   1213     Reg->setRegNum(RegNum);
   1214     PhysicalRegisters[Ty][RegNum] = Reg;
   1215     // Specially mark a named physical register as an "argument" so that it is
   1216     // considered live upon function entry.  Otherwise it's possible to get
   1217     // liveness validation errors for saving callee-save registers.
   1218     Func->addImplicitArg(Reg);
   1219     // Don't bother tracking the live range of a named physical register.
   1220     Reg->setIgnoreLiveness();
   1221   }
   1222   return Reg;
   1223 }
   1224 
   1225 void TargetARM32::emitJumpTable(const Cfg *Func,
   1226                                 const InstJumpTable *JumpTable) const {
   1227   (void)Func;
   1228   (void)JumpTable;
   1229   UnimplementedError(getFlags());
   1230 }
   1231 
   1232 void TargetARM32::emitVariable(const Variable *Var) const {
   1233   if (!BuildDefs::dump())
   1234     return;
   1235   Ostream &Str = Ctx->getStrEmit();
   1236   if (Var->hasReg()) {
   1237     Str << getRegName(Var->getRegNum(), Var->getType());
   1238     return;
   1239   }
   1240   if (Var->mustHaveReg()) {
   1241     llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
   1242                              ") has no register assigned - function " +
   1243                              Func->getFunctionName());
   1244   }
   1245   assert(!Var->isRematerializable());
   1246   int32_t Offset = Var->getStackOffset();
   1247   auto BaseRegNum = Var->getBaseRegNum();
   1248   if (BaseRegNum.hasNoValue()) {
   1249     BaseRegNum = getFrameOrStackReg();
   1250   }
   1251   const Type VarTy = Var->getType();
   1252   Str << "[" << getRegName(BaseRegNum, VarTy);
   1253   if (Offset != 0) {
   1254     Str << ", #" << Offset;
   1255   }
   1256   Str << "]";
   1257 }
   1258 
   1259 TargetARM32::CallingConv::CallingConv()
   1260     : GPRegsUsed(RegARM32::Reg_NUM),
   1261       GPRArgs(GPRArgInitializer.rbegin(), GPRArgInitializer.rend()),
   1262       I64Args(I64ArgInitializer.rbegin(), I64ArgInitializer.rend()),
   1263       VFPRegsUsed(RegARM32::Reg_NUM),
   1264       FP32Args(FP32ArgInitializer.rbegin(), FP32ArgInitializer.rend()),
   1265       FP64Args(FP64ArgInitializer.rbegin(), FP64ArgInitializer.rend()),
   1266       Vec128Args(Vec128ArgInitializer.rbegin(), Vec128ArgInitializer.rend()) {}
   1267 
   1268 bool TargetARM32::CallingConv::argInGPR(Type Ty, RegNumT *Reg) {
   1269   CfgVector<RegNumT> *Source;
   1270 
   1271   switch (Ty) {
   1272   default: {
   1273     assert(isScalarIntegerType(Ty));
   1274     Source = &GPRArgs;
   1275   } break;
   1276   case IceType_i64: {
   1277     Source = &I64Args;
   1278   } break;
   1279   }
   1280 
   1281   discardUnavailableGPRsAndTheirAliases(Source);
   1282 
   1283   if (Source->empty()) {
   1284     GPRegsUsed.set();
   1285     return false;
   1286   }
   1287 
   1288   *Reg = Source->back();
   1289   // Note that we don't Source->pop_back() here. This is intentional. Notice how
   1290   // we mark all of Reg's aliases as Used. So, for the next argument,
   1291   // Source->back() is marked as unavailable, and it is thus implicitly popped
   1292   // from the stack.
   1293   GPRegsUsed |= RegisterAliases[*Reg];
   1294   return true;
   1295 }
   1296 
   1297 // GPR are not packed when passing parameters. Thus, a function foo(i32, i64,
   1298 // i32) will have the first argument in r0, the second in r1-r2, and the third
   1299 // on the stack. To model this behavior, whenever we pop a register from Regs,
   1300 // we remove all of its aliases from the pool of available GPRs. This has the
   1301 // effect of computing the "closure" on the GPR registers.
   1302 void TargetARM32::CallingConv::discardUnavailableGPRsAndTheirAliases(
   1303     CfgVector<RegNumT> *Regs) {
   1304   while (!Regs->empty() && GPRegsUsed[Regs->back()]) {
   1305     GPRegsUsed |= RegisterAliases[Regs->back()];
   1306     Regs->pop_back();
   1307   }
   1308 }
   1309 
   1310 bool TargetARM32::CallingConv::argInVFP(Type Ty, RegNumT *Reg) {
   1311   CfgVector<RegNumT> *Source;
   1312 
   1313   switch (Ty) {
   1314   default: {
   1315     assert(isVectorType(Ty));
   1316     Source = &Vec128Args;
   1317   } break;
   1318   case IceType_f32: {
   1319     Source = &FP32Args;
   1320   } break;
   1321   case IceType_f64: {
   1322     Source = &FP64Args;
   1323   } break;
   1324   }
   1325 
   1326   discardUnavailableVFPRegs(Source);
   1327 
   1328   if (Source->empty()) {
   1329     VFPRegsUsed.set();
   1330     return false;
   1331   }
   1332 
   1333   *Reg = Source->back();
   1334   VFPRegsUsed |= RegisterAliases[*Reg];
   1335   return true;
   1336 }
   1337 
   1338 // Arguments in VFP registers are not packed, so we don't mark the popped
   1339 // registers' aliases as unavailable.
   1340 void TargetARM32::CallingConv::discardUnavailableVFPRegs(
   1341     CfgVector<RegNumT> *Regs) {
   1342   while (!Regs->empty() && VFPRegsUsed[Regs->back()]) {
   1343     Regs->pop_back();
   1344   }
   1345 }
   1346 
   1347 void TargetARM32::lowerArguments() {
   1348   VarList &Args = Func->getArgs();
   1349   TargetARM32::CallingConv CC;
   1350 
   1351   // For each register argument, replace Arg in the argument list with the home
   1352   // register. Then generate an instruction in the prolog to copy the home
   1353   // register to the assigned location of Arg.
   1354   Context.init(Func->getEntryNode());
   1355   Context.setInsertPoint(Context.getCur());
   1356 
   1357   for (SizeT I = 0, E = Args.size(); I < E; ++I) {
   1358     Variable *Arg = Args[I];
   1359     Type Ty = Arg->getType();
   1360     RegNumT RegNum;
   1361     if (isScalarIntegerType(Ty)) {
   1362       if (!CC.argInGPR(Ty, &RegNum)) {
   1363         continue;
   1364       }
   1365     } else {
   1366       if (!CC.argInVFP(Ty, &RegNum)) {
   1367         continue;
   1368       }
   1369     }
   1370 
   1371     Variable *RegisterArg = Func->makeVariable(Ty);
   1372     if (BuildDefs::dump()) {
   1373       RegisterArg->setName(Func, "home_reg:" + Arg->getName());
   1374     }
   1375     RegisterArg->setIsArg();
   1376     Arg->setIsArg(false);
   1377     Args[I] = RegisterArg;
   1378     switch (Ty) {
   1379     default: { RegisterArg->setRegNum(RegNum); } break;
   1380     case IceType_i64: {
   1381       auto *RegisterArg64 = llvm::cast<Variable64On32>(RegisterArg);
   1382       RegisterArg64->initHiLo(Func);
   1383       RegisterArg64->getLo()->setRegNum(
   1384           RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(RegNum)));
   1385       RegisterArg64->getHi()->setRegNum(
   1386           RegNumT::fixme(RegARM32::getI64PairSecondGPRNum(RegNum)));
   1387     } break;
   1388     }
   1389     Context.insert<InstAssign>(Arg, RegisterArg);
   1390   }
   1391 }
   1392 
   1393 // Helper function for addProlog().
   1394 //
   1395 // This assumes Arg is an argument passed on the stack. This sets the frame
   1396 // offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
   1397 // I64 arg that has been split into Lo and Hi components, it calls itself
   1398 // recursively on the components, taking care to handle Lo first because of the
   1399 // little-endian architecture. Lastly, this function generates an instruction
   1400 // to copy Arg into its assigned register if applicable.
   1401 void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
   1402                                          size_t BasicFrameOffset,
   1403                                          size_t *InArgsSizeBytes) {
   1404   const Type Ty = Arg->getType();
   1405   *InArgsSizeBytes = applyStackAlignmentTy(*InArgsSizeBytes, Ty);
   1406 
   1407   if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
   1408     Variable *const Lo = Arg64On32->getLo();
   1409     Variable *const Hi = Arg64On32->getHi();
   1410     finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
   1411     finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
   1412     return;
   1413   }
   1414   assert(Ty != IceType_i64);
   1415 
   1416   const int32_t ArgStackOffset = BasicFrameOffset + *InArgsSizeBytes;
   1417   *InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
   1418 
   1419   if (!Arg->hasReg()) {
   1420     Arg->setStackOffset(ArgStackOffset);
   1421     return;
   1422   }
   1423 
   1424   // If the argument variable has been assigned a register, we need to copy the
   1425   // value from the stack slot.
   1426   Variable *Parameter = Func->makeVariable(Ty);
   1427   Parameter->setMustNotHaveReg();
   1428   Parameter->setStackOffset(ArgStackOffset);
   1429   _mov(Arg, Parameter);
   1430 }
   1431 
   1432 Type TargetARM32::stackSlotType() { return IceType_i32; }
   1433 
   1434 void TargetARM32::addProlog(CfgNode *Node) {
   1435   // Stack frame layout:
   1436   //
   1437   // +------------------------+
   1438   // | 1. preserved registers |
   1439   // +------------------------+
   1440   // | 2. padding             |
   1441   // +------------------------+ <--- FramePointer (if used)
   1442   // | 3. global spill area   |
   1443   // +------------------------+
   1444   // | 4. padding             |
   1445   // +------------------------+
   1446   // | 5. local spill area    |
   1447   // +------------------------+
   1448   // | 6. padding             |
   1449   // +------------------------+
   1450   // | 7. allocas (variable)  |
   1451   // +------------------------+
   1452   // | 8. padding             |
   1453   // +------------------------+
   1454   // | 9. out args            |
   1455   // +------------------------+ <--- StackPointer
   1456   //
   1457   // The following variables record the size in bytes of the given areas:
   1458   //  * PreservedRegsSizeBytes: area 1
   1459   //  * SpillAreaPaddingBytes:  area 2
   1460   //  * GlobalsSize:            area 3
   1461   //  * GlobalsAndSubsequentPaddingSize: areas 3 - 4
   1462   //  * LocalsSpillAreaSize:    area 5
   1463   //  * SpillAreaSizeBytes:     areas 2 - 6, and 9
   1464   //  * MaxOutArgsSizeBytes:    area 9
   1465   //
   1466   // Determine stack frame offsets for each Variable without a register
   1467   // assignment.  This can be done as one variable per stack slot.  Or, do
   1468   // coalescing by running the register allocator again with an infinite set of
   1469   // registers (as a side effect, this gives variables a second chance at
   1470   // physical register assignment).
   1471   //
   1472   // A middle ground approach is to leverage sparsity and allocate one block of
   1473   // space on the frame for globals (variables with multi-block lifetime), and
   1474   // one block to share for locals (single-block lifetime).
   1475 
   1476   Context.init(Node);
   1477   Context.setInsertPoint(Context.getCur());
   1478 
   1479   SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
   1480   RegsUsed = SmallBitVector(CalleeSaves.size());
   1481   VarList SortedSpilledVariables;
   1482   size_t GlobalsSize = 0;
   1483   // If there is a separate locals area, this represents that area. Otherwise
   1484   // it counts any variable not counted by GlobalsSize.
   1485   SpillAreaSizeBytes = 0;
   1486   // If there is a separate locals area, this specifies the alignment for it.
   1487   uint32_t LocalsSlotsAlignmentBytes = 0;
   1488   // The entire spill locations area gets aligned to largest natural alignment
   1489   // of the variables that have a spill slot.
   1490   uint32_t SpillAreaAlignmentBytes = 0;
   1491   // For now, we don't have target-specific variables that need special
   1492   // treatment (no stack-slot-linked SpillVariable type).
   1493   std::function<bool(Variable *)> TargetVarHook = [](Variable *Var) {
   1494     static constexpr bool AssignStackSlot = false;
   1495     static constexpr bool DontAssignStackSlot = !AssignStackSlot;
   1496     if (llvm::isa<Variable64On32>(Var)) {
   1497       return DontAssignStackSlot;
   1498     }
   1499     return AssignStackSlot;
   1500   };
   1501 
   1502   // Compute the list of spilled variables and bounds for GlobalsSize, etc.
   1503   getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
   1504                         &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
   1505                         &LocalsSlotsAlignmentBytes, TargetVarHook);
   1506   uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
   1507   SpillAreaSizeBytes += GlobalsSize;
   1508 
   1509   // Add push instructions for preserved registers. On ARM, "push" can push a
   1510   // whole list of GPRs via a bitmask (0-15). Unlike x86, ARM also has
   1511   // callee-saved float/vector registers.
   1512   //
   1513   // The "vpush" instruction can handle a whole list of float/vector registers,
   1514   // but it only handles contiguous sequences of registers by specifying the
   1515   // start and the length.
   1516   PreservedGPRs.reserve(CalleeSaves.size());
   1517   PreservedSRegs.reserve(CalleeSaves.size());
   1518 
   1519   // Consider FP and LR as callee-save / used as needed.
   1520   if (UsesFramePointer) {
   1521     if (RegsUsed[RegARM32::Reg_fp]) {
   1522       llvm::report_fatal_error("Frame pointer has been used.");
   1523     }
   1524     CalleeSaves[RegARM32::Reg_fp] = true;
   1525     RegsUsed[RegARM32::Reg_fp] = true;
   1526   }
   1527   if (!MaybeLeafFunc) {
   1528     CalleeSaves[RegARM32::Reg_lr] = true;
   1529     RegsUsed[RegARM32::Reg_lr] = true;
   1530   }
   1531 
   1532   // Make two passes over the used registers. The first pass records all the
   1533   // used registers -- and their aliases. Then, we figure out which GPRs and
   1534   // VFP S registers should be saved. We don't bother saving D/Q registers
   1535   // because their uses are recorded as S regs uses.
   1536   SmallBitVector ToPreserve(RegARM32::Reg_NUM);
   1537   for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
   1538     if (NeedSandboxing && i == RegARM32::Reg_r9) {
   1539       // r9 is never updated in sandboxed code.
   1540       continue;
   1541     }
   1542     if (CalleeSaves[i] && RegsUsed[i]) {
   1543       ToPreserve |= RegisterAliases[i];
   1544     }
   1545   }
   1546 
   1547   uint32_t NumCallee = 0;
   1548   size_t PreservedRegsSizeBytes = 0;
   1549 
   1550   // RegClasses is a tuple of
   1551   //
   1552   // <First Register in Class, Last Register in Class, Vector of Save Registers>
   1553   //
   1554   // We use this tuple to figure out which register we should push/pop during
   1555   // prolog/epilog.
   1556   using RegClassType = std::tuple<uint32_t, uint32_t, VarList *>;
   1557   const RegClassType RegClasses[] = {
   1558       RegClassType(RegARM32::Reg_GPR_First, RegARM32::Reg_GPR_Last,
   1559                    &PreservedGPRs),
   1560       RegClassType(RegARM32::Reg_SREG_First, RegARM32::Reg_SREG_Last,
   1561                    &PreservedSRegs)};
   1562   for (const auto &RegClass : RegClasses) {
   1563     const uint32_t FirstRegInClass = std::get<0>(RegClass);
   1564     const uint32_t LastRegInClass = std::get<1>(RegClass);
   1565     VarList *const PreservedRegsInClass = std::get<2>(RegClass);
   1566     for (uint32_t Reg = FirstRegInClass; Reg <= LastRegInClass; ++Reg) {
   1567       if (!ToPreserve[Reg]) {
   1568         continue;
   1569       }
   1570       ++NumCallee;
   1571       Variable *PhysicalRegister = getPhysicalRegister(RegNumT::fromInt(Reg));
   1572       PreservedRegsSizeBytes +=
   1573           typeWidthInBytesOnStack(PhysicalRegister->getType());
   1574       PreservedRegsInClass->push_back(PhysicalRegister);
   1575     }
   1576   }
   1577 
   1578   Ctx->statsUpdateRegistersSaved(NumCallee);
   1579   if (!PreservedSRegs.empty())
   1580     _push(PreservedSRegs);
   1581   if (!PreservedGPRs.empty())
   1582     _push(PreservedGPRs);
   1583 
   1584   // Generate "mov FP, SP" if needed.
   1585   if (UsesFramePointer) {
   1586     Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
   1587     Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   1588     _mov(FP, SP);
   1589     // Keep FP live for late-stage liveness analysis (e.g. asm-verbose mode).
   1590     Context.insert<InstFakeUse>(FP);
   1591   }
   1592 
   1593   // Align the variables area. SpillAreaPaddingBytes is the size of the region
   1594   // after the preserved registers and before the spill areas.
   1595   // LocalsSlotsPaddingBytes is the amount of padding between the globals and
   1596   // locals area if they are separate.
   1597   assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES);
   1598   assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
   1599   uint32_t SpillAreaPaddingBytes = 0;
   1600   uint32_t LocalsSlotsPaddingBytes = 0;
   1601   alignStackSpillAreas(PreservedRegsSizeBytes, SpillAreaAlignmentBytes,
   1602                        GlobalsSize, LocalsSlotsAlignmentBytes,
   1603                        &SpillAreaPaddingBytes, &LocalsSlotsPaddingBytes);
   1604   SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
   1605   uint32_t GlobalsAndSubsequentPaddingSize =
   1606       GlobalsSize + LocalsSlotsPaddingBytes;
   1607 
   1608   // Adds the out args space to the stack, and align SP if necessary.
   1609   if (!NeedsStackAlignment) {
   1610     SpillAreaSizeBytes += MaxOutArgsSizeBytes;
   1611   } else {
   1612     uint32_t StackOffset = PreservedRegsSizeBytes;
   1613     uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
   1614     StackSize = applyStackAlignment(StackSize + MaxOutArgsSizeBytes);
   1615     SpillAreaSizeBytes = StackSize - StackOffset;
   1616   }
   1617 
   1618   // Combine fixed alloca with SpillAreaSize.
   1619   SpillAreaSizeBytes += FixedAllocaSizeBytes;
   1620 
   1621   // Generate "sub sp, SpillAreaSizeBytes"
   1622   if (SpillAreaSizeBytes) {
   1623     // Use the scratch register if needed to legalize the immediate.
   1624     Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
   1625                                   Legal_Reg | Legal_Flex, getReservedTmpReg());
   1626     Sandboxer(this).sub_sp(SubAmount);
   1627     if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {
   1628       Sandboxer(this).align_sp(FixedAllocaAlignBytes);
   1629     }
   1630   }
   1631 
   1632   Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
   1633 
   1634   // Fill in stack offsets for stack args, and copy args into registers for
   1635   // those that were register-allocated. Args are pushed right to left, so
   1636   // Arg[0] is closest to the stack/frame pointer.
   1637   Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
   1638   size_t BasicFrameOffset = PreservedRegsSizeBytes;
   1639   if (!UsesFramePointer)
   1640     BasicFrameOffset += SpillAreaSizeBytes;
   1641 
   1642   materializeGotAddr(Node);
   1643 
   1644   const VarList &Args = Func->getArgs();
   1645   size_t InArgsSizeBytes = 0;
   1646   TargetARM32::CallingConv CC;
   1647   for (Variable *Arg : Args) {
   1648     RegNumT DummyReg;
   1649     const Type Ty = Arg->getType();
   1650 
   1651     // Skip arguments passed in registers.
   1652     if (isScalarIntegerType(Ty)) {
   1653       if (CC.argInGPR(Ty, &DummyReg)) {
   1654         continue;
   1655       }
   1656     } else {
   1657       if (CC.argInVFP(Ty, &DummyReg)) {
   1658         continue;
   1659       }
   1660     }
   1661     finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, &InArgsSizeBytes);
   1662   }
   1663 
   1664   // Fill in stack offsets for locals.
   1665   assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
   1666                       SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
   1667                       UsesFramePointer);
   1668   this->HasComputedFrame = true;
   1669 
   1670   if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
   1671     OstreamLocker _(Func->getContext());
   1672     Ostream &Str = Func->getContext()->getStrDump();
   1673 
   1674     Str << "Stack layout:\n";
   1675     uint32_t SPAdjustmentPaddingSize =
   1676         SpillAreaSizeBytes - LocalsSpillAreaSize -
   1677         GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
   1678         MaxOutArgsSizeBytes;
   1679     Str << " in-args = " << InArgsSizeBytes << " bytes\n"
   1680         << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
   1681         << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
   1682         << " globals spill area = " << GlobalsSize << " bytes\n"
   1683         << " globals-locals spill areas intermediate padding = "
   1684         << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
   1685         << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
   1686         << " SP alignment padding = " << SPAdjustmentPaddingSize << " bytes\n";
   1687 
   1688     Str << "Stack details:\n"
   1689         << " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"
   1690         << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
   1691         << " outgoing args size = " << MaxOutArgsSizeBytes << " bytes\n"
   1692         << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
   1693         << " bytes\n"
   1694         << " is FP based = " << UsesFramePointer << "\n";
   1695   }
   1696 }
   1697 
   1698 void TargetARM32::addEpilog(CfgNode *Node) {
   1699   InstList &Insts = Node->getInsts();
   1700   InstList::reverse_iterator RI, E;
   1701   for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
   1702     if (llvm::isa<InstARM32Ret>(*RI))
   1703       break;
   1704   }
   1705   if (RI == E)
   1706     return;
   1707 
   1708   // Convert the reverse_iterator position into its corresponding (forward)
   1709   // iterator position.
   1710   InstList::iterator InsertPoint = reverseToForwardIterator(RI);
   1711   --InsertPoint;
   1712   Context.init(Node);
   1713   Context.setInsertPoint(InsertPoint);
   1714 
   1715   Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   1716   if (UsesFramePointer) {
   1717     Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
   1718     // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
   1719     // use of SP before the assignment of SP=FP keeps previous SP adjustments
   1720     // from being dead-code eliminated.
   1721     Context.insert<InstFakeUse>(SP);
   1722     Sandboxer(this).reset_sp(FP);
   1723   } else {
   1724     // add SP, SpillAreaSizeBytes
   1725     if (SpillAreaSizeBytes) {
   1726       // Use the scratch register if needed to legalize the immediate.
   1727       Operand *AddAmount =
   1728           legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
   1729                    Legal_Reg | Legal_Flex, getReservedTmpReg());
   1730       Sandboxer(this).add_sp(AddAmount);
   1731     }
   1732   }
   1733 
   1734   if (!PreservedGPRs.empty())
   1735     _pop(PreservedGPRs);
   1736   if (!PreservedSRegs.empty())
   1737     _pop(PreservedSRegs);
   1738 
   1739   if (!getFlags().getUseSandboxing())
   1740     return;
   1741 
   1742   // Change the original ret instruction into a sandboxed return sequence.
   1743   //
   1744   // bundle_lock
   1745   // bic lr, #0xc000000f
   1746   // bx lr
   1747   // bundle_unlock
   1748   //
   1749   // This isn't just aligning to the getBundleAlignLog2Bytes(). It needs to
   1750   // restrict to the lower 1GB as well.
   1751   Variable *LR = getPhysicalRegister(RegARM32::Reg_lr);
   1752   Variable *RetValue = nullptr;
   1753   if (RI->getSrcSize())
   1754     RetValue = llvm::cast<Variable>(RI->getSrc(0));
   1755 
   1756   Sandboxer(this).ret(LR, RetValue);
   1757 
   1758   RI->setDeleted();
   1759 }
   1760 
   1761 bool TargetARM32::isLegalMemOffset(Type Ty, int32_t Offset) const {
   1762   constexpr bool ZeroExt = false;
   1763   return OperandARM32Mem::canHoldOffset(Ty, ZeroExt, Offset);
   1764 }
   1765 
   1766 Variable *TargetARM32::PostLoweringLegalizer::newBaseRegister(
   1767     Variable *Base, int32_t Offset, RegNumT ScratchRegNum) {
   1768   // Legalize will likely need a movw/movt combination, but if the top bits are
   1769   // all 0 from negating the offset and subtracting, we could use that instead.
   1770   const bool ShouldSub = Offset != 0 && (-Offset & 0xFFFF0000) == 0;
   1771   Variable *ScratchReg = Target->makeReg(IceType_i32, ScratchRegNum);
   1772   if (ShouldSub) {
   1773     Operand *OffsetVal =
   1774         Target->legalize(Target->Ctx->getConstantInt32(-Offset),
   1775                          Legal_Reg | Legal_Flex, ScratchRegNum);
   1776     Target->_sub(ScratchReg, Base, OffsetVal);
   1777   } else {
   1778     Operand *OffsetVal =
   1779         Target->legalize(Target->Ctx->getConstantInt32(Offset),
   1780                          Legal_Reg | Legal_Flex, ScratchRegNum);
   1781     Target->_add(ScratchReg, Base, OffsetVal);
   1782   }
   1783 
   1784   if (ScratchRegNum == Target->getReservedTmpReg()) {
   1785     const bool BaseIsStackOrFramePtr =
   1786         Base->getRegNum() == Target->getFrameOrStackReg();
   1787     // There is currently no code path that would trigger this assertion, so we
   1788     // leave this assertion here in case it is ever violated. This is not a
   1789     // fatal error (thus the use of assert() and not llvm::report_fatal_error)
   1790     // as the program compiled by subzero will still work correctly.
   1791     assert(BaseIsStackOrFramePtr);
   1792     // Side-effect: updates TempBase to reflect the new Temporary.
   1793     if (BaseIsStackOrFramePtr) {
   1794       TempBaseReg = ScratchReg;
   1795       TempBaseOffset = Offset;
   1796     } else {
   1797       TempBaseReg = nullptr;
   1798       TempBaseOffset = 0;
   1799     }
   1800   }
   1801 
   1802   return ScratchReg;
   1803 }
   1804 
   1805 OperandARM32Mem *TargetARM32::PostLoweringLegalizer::createMemOperand(
   1806     Type Ty, Variable *Base, int32_t Offset, bool AllowOffsets) {
   1807   assert(!Base->isRematerializable());
   1808   if (Offset == 0 || (AllowOffsets && Target->isLegalMemOffset(Ty, Offset))) {
   1809     return OperandARM32Mem::create(
   1810         Target->Func, Ty, Base,
   1811         llvm::cast<ConstantInteger32>(Target->Ctx->getConstantInt32(Offset)),
   1812         OperandARM32Mem::Offset);
   1813   }
   1814 
   1815   if (!AllowOffsets || TempBaseReg == nullptr) {
   1816     newBaseRegister(Base, Offset, Target->getReservedTmpReg());
   1817   }
   1818 
   1819   int32_t OffsetDiff = Offset - TempBaseOffset;
   1820   assert(AllowOffsets || OffsetDiff == 0);
   1821 
   1822   if (!Target->isLegalMemOffset(Ty, OffsetDiff)) {
   1823     newBaseRegister(Base, Offset, Target->getReservedTmpReg());
   1824     OffsetDiff = 0;
   1825   }
   1826 
   1827   assert(!TempBaseReg->isRematerializable());
   1828   return OperandARM32Mem::create(
   1829       Target->Func, Ty, TempBaseReg,
   1830       llvm::cast<ConstantInteger32>(Target->Ctx->getConstantInt32(OffsetDiff)),
   1831       OperandARM32Mem::Offset);
   1832 }
   1833 
   1834 void TargetARM32::PostLoweringLegalizer::resetTempBaseIfClobberedBy(
   1835     const Inst *Instr) {
   1836   bool ClobbersTempBase = false;
   1837   if (TempBaseReg != nullptr) {
   1838     Variable *Dest = Instr->getDest();
   1839     if (llvm::isa<InstARM32Call>(Instr)) {
   1840       // The following assertion is an invariant, so we remove it from the if
   1841       // test. If the invariant is ever broken/invalidated/changed, remember
   1842       // to add it back to the if condition.
   1843       assert(TempBaseReg->getRegNum() == Target->getReservedTmpReg());
   1844       // The linker may need to clobber IP if the call is too far from PC. Thus,
   1845       // we assume IP will be overwritten.
   1846       ClobbersTempBase = true;
   1847     } else if (Dest != nullptr &&
   1848                Dest->getRegNum() == TempBaseReg->getRegNum()) {
   1849       // Register redefinition.
   1850       ClobbersTempBase = true;
   1851     }
   1852   }
   1853 
   1854   if (ClobbersTempBase) {
   1855     TempBaseReg = nullptr;
   1856     TempBaseOffset = 0;
   1857   }
   1858 }
   1859 
   1860 void TargetARM32::PostLoweringLegalizer::legalizeMov(InstARM32Mov *MovInstr) {
   1861   Variable *Dest = MovInstr->getDest();
   1862   assert(Dest != nullptr);
   1863   Type DestTy = Dest->getType();
   1864   assert(DestTy != IceType_i64);
   1865 
   1866   Operand *Src = MovInstr->getSrc(0);
   1867   Type SrcTy = Src->getType();
   1868   (void)SrcTy;
   1869   assert(SrcTy != IceType_i64);
   1870 
   1871   if (MovInstr->isMultiDest() || MovInstr->isMultiSource())
   1872     return;
   1873 
   1874   bool Legalized = false;
   1875   if (!Dest->hasReg()) {
   1876     auto *SrcR = llvm::cast<Variable>(Src);
   1877     assert(SrcR->hasReg());
   1878     assert(!SrcR->isRematerializable());
   1879     const int32_t Offset = Dest->getStackOffset();
   1880     // This is a _mov(Mem(), Variable), i.e., a store.
   1881     TargetARM32::Sandboxer(Target)
   1882         .str(SrcR, createMemOperand(DestTy, StackOrFrameReg, Offset),
   1883              MovInstr->getPredicate());
   1884     // _str() does not have a Dest, so we add a fake-def(Dest).
   1885     Target->Context.insert<InstFakeDef>(Dest);
   1886     Legalized = true;
   1887   } else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {
   1888     if (Var->isRematerializable()) {
   1889       // This is equivalent to an x86 _lea(RematOffset(%esp/%ebp), Variable).
   1890 
   1891       // ExtraOffset is only needed for frame-pointer based frames as we have
   1892       // to account for spill storage.
   1893       const int32_t ExtraOffset = (Var->getRegNum() == Target->getFrameReg())
   1894                                       ? Target->getFrameFixedAllocaOffset()
   1895                                       : 0;
   1896 
   1897       const int32_t Offset = Var->getStackOffset() + ExtraOffset;
   1898       Variable *Base = Target->getPhysicalRegister(Var->getRegNum());
   1899       Variable *T = newBaseRegister(Base, Offset, Dest->getRegNum());
   1900       Target->_mov(Dest, T);
   1901       Legalized = true;
   1902     } else {
   1903       if (!Var->hasReg()) {
   1904         // This is a _mov(Variable, Mem()), i.e., a load.
   1905         const int32_t Offset = Var->getStackOffset();
   1906         TargetARM32::Sandboxer(Target)
   1907             .ldr(Dest, createMemOperand(DestTy, StackOrFrameReg, Offset),
   1908                  MovInstr->getPredicate());
   1909         Legalized = true;
   1910       }
   1911     }
   1912   }
   1913 
   1914   if (Legalized) {
   1915     if (MovInstr->isDestRedefined()) {
   1916       Target->_set_dest_redefined();
   1917     }
   1918     MovInstr->setDeleted();
   1919   }
   1920 }
   1921 
   1922 // ARM32 address modes:
   1923 //  ld/st i[8|16|32]: [reg], [reg +/- imm12], [pc +/- imm12],
   1924 //                    [reg +/- reg << shamt5]
   1925 //  ld/st f[32|64]  : [reg], [reg +/- imm8] , [pc +/- imm8]
   1926 //  ld/st vectors   : [reg]
   1927 //
   1928 // For now, we don't handle address modes with Relocatables.
   1929 namespace {
   1930 // MemTraits contains per-type valid address mode information.
   1931 #define X(tag, elementty, int_width, fp_width, uvec_width, svec_width, sbits,  \
   1932           ubits, rraddr, shaddr)                                               \
   1933   static_assert(!(shaddr) || rraddr, "Check ICETYPEARM32_TABLE::" #tag);
   1934 ICETYPEARM32_TABLE
   1935 #undef X
   1936 
   1937 static const struct {
   1938   int32_t ValidImmMask;
   1939   bool CanHaveImm;
   1940   bool CanHaveIndex;
   1941   bool CanHaveShiftedIndex;
   1942 } MemTraits[] = {
   1943 #define X(tag, elementty, int_width, fp_width, uvec_width, svec_width, sbits,  \
   1944           ubits, rraddr, shaddr)                                               \
   1945   { (1 << ubits) - 1, (ubits) > 0, rraddr, shaddr, }                           \
   1946   ,
   1947     ICETYPEARM32_TABLE
   1948 #undef X
   1949 };
   1950 static constexpr SizeT MemTraitsSize = llvm::array_lengthof(MemTraits);
   1951 } // end of anonymous namespace
   1952 
   1953 OperandARM32Mem *
   1954 TargetARM32::PostLoweringLegalizer::legalizeMemOperand(OperandARM32Mem *Mem,
   1955                                                        bool AllowOffsets) {
   1956   assert(!Mem->isRegReg() || !Mem->getIndex()->isRematerializable());
   1957   assert(
   1958       Mem->isRegReg() ||
   1959       Target->isLegalMemOffset(Mem->getType(), Mem->getOffset()->getValue()));
   1960 
   1961   bool Legalized = false;
   1962   Variable *Base = Mem->getBase();
   1963   int32_t Offset = Mem->isRegReg() ? 0 : Mem->getOffset()->getValue();
   1964   if (Base->isRematerializable()) {
   1965     const int32_t ExtraOffset = (Base->getRegNum() == Target->getFrameReg())
   1966                                     ? Target->getFrameFixedAllocaOffset()
   1967                                     : 0;
   1968     Offset += Base->getStackOffset() + ExtraOffset;
   1969     Base = Target->getPhysicalRegister(Base->getRegNum());
   1970     assert(!Base->isRematerializable());
   1971     Legalized = true;
   1972   }
   1973 
   1974   if (!Legalized && !Target->NeedSandboxing) {
   1975     return nullptr;
   1976   }
   1977 
   1978   if (!Mem->isRegReg()) {
   1979     return createMemOperand(Mem->getType(), Base, Offset, AllowOffsets);
   1980   }
   1981 
   1982   if (Target->NeedSandboxing) {
   1983     llvm::report_fatal_error("Reg-Reg address mode is not allowed.");
   1984   }
   1985 
   1986   assert(MemTraits[Mem->getType()].CanHaveIndex);
   1987 
   1988   if (Offset != 0) {
   1989     if (TempBaseReg == nullptr) {
   1990       Base = newBaseRegister(Base, Offset, Target->getReservedTmpReg());
   1991     } else {
   1992       uint32_t Imm8, Rotate;
   1993       const int32_t OffsetDiff = Offset - TempBaseOffset;
   1994       if (OffsetDiff == 0) {
   1995         Base = TempBaseReg;
   1996       } else if (OperandARM32FlexImm::canHoldImm(OffsetDiff, &Rotate, &Imm8)) {
   1997         auto *OffsetDiffF = OperandARM32FlexImm::create(
   1998             Target->Func, IceType_i32, Imm8, Rotate);
   1999         Target->_add(TempBaseReg, TempBaseReg, OffsetDiffF);
   2000         TempBaseOffset += OffsetDiff;
   2001         Base = TempBaseReg;
   2002       } else if (OperandARM32FlexImm::canHoldImm(-OffsetDiff, &Rotate, &Imm8)) {
   2003         auto *OffsetDiffF = OperandARM32FlexImm::create(
   2004             Target->Func, IceType_i32, Imm8, Rotate);
   2005         Target->_sub(TempBaseReg, TempBaseReg, OffsetDiffF);
   2006         TempBaseOffset += OffsetDiff;
   2007         Base = TempBaseReg;
   2008       } else {
   2009         Base = newBaseRegister(Base, Offset, Target->getReservedTmpReg());
   2010       }
   2011     }
   2012   }
   2013 
   2014   return OperandARM32Mem::create(Target->Func, Mem->getType(), Base,
   2015                                  Mem->getIndex(), Mem->getShiftOp(),
   2016                                  Mem->getShiftAmt(), Mem->getAddrMode());
   2017 }
   2018 
   2019 void TargetARM32::postLowerLegalization() {
   2020   // If a stack variable's frame offset doesn't fit, convert from:
   2021   //   ldr X, OFF[SP]
   2022   // to:
   2023   //   movw/movt TMP, OFF_PART
   2024   //   add TMP, TMP, SP
   2025   //   ldr X, OFF_MORE[TMP]
   2026   //
   2027   // This is safe because we have reserved TMP, and add for ARM does not
   2028   // clobber the flags register.
   2029   Func->dump("Before postLowerLegalization");
   2030   assert(hasComputedFrame());
   2031   // Do a fairly naive greedy clustering for now. Pick the first stack slot
   2032   // that's out of bounds and make a new base reg using the architecture's temp
   2033   // register. If that works for the next slot, then great. Otherwise, create a
   2034   // new base register, clobbering the previous base register. Never share a
   2035   // base reg across different basic blocks. This isn't ideal if local and
   2036   // multi-block variables are far apart and their references are interspersed.
   2037   // It may help to be more coordinated about assign stack slot numbers and may
   2038   // help to assign smaller offsets to higher-weight variables so that they
   2039   // don't depend on this legalization.
   2040   for (CfgNode *Node : Func->getNodes()) {
   2041     Context.init(Node);
   2042     // One legalizer per basic block, otherwise we would share the Temporary
   2043     // Base Register between basic blocks.
   2044     PostLoweringLegalizer Legalizer(this);
   2045     while (!Context.atEnd()) {
   2046       PostIncrLoweringContext PostIncrement(Context);
   2047       Inst *CurInstr = iteratorToInst(Context.getCur());
   2048 
   2049       // Check if the previous TempBaseReg is clobbered, and reset if needed.
   2050       Legalizer.resetTempBaseIfClobberedBy(CurInstr);
   2051 
   2052       if (auto *MovInstr = llvm::dyn_cast<InstARM32Mov>(CurInstr)) {
   2053         Legalizer.legalizeMov(MovInstr);
   2054       } else if (auto *LdrInstr = llvm::dyn_cast<InstARM32Ldr>(CurInstr)) {
   2055         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
   2056                 llvm::cast<OperandARM32Mem>(LdrInstr->getSrc(0)))) {
   2057           Sandboxer(this)
   2058               .ldr(CurInstr->getDest(), LegalMem, LdrInstr->getPredicate());
   2059           CurInstr->setDeleted();
   2060         }
   2061       } else if (auto *LdrexInstr = llvm::dyn_cast<InstARM32Ldrex>(CurInstr)) {
   2062         constexpr bool DisallowOffsetsBecauseLdrex = false;
   2063         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
   2064                 llvm::cast<OperandARM32Mem>(LdrexInstr->getSrc(0)),
   2065                 DisallowOffsetsBecauseLdrex)) {
   2066           Sandboxer(this)
   2067               .ldrex(CurInstr->getDest(), LegalMem, LdrexInstr->getPredicate());
   2068           CurInstr->setDeleted();
   2069         }
   2070       } else if (auto *StrInstr = llvm::dyn_cast<InstARM32Str>(CurInstr)) {
   2071         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
   2072                 llvm::cast<OperandARM32Mem>(StrInstr->getSrc(1)))) {
   2073           Sandboxer(this).str(llvm::cast<Variable>(CurInstr->getSrc(0)),
   2074                               LegalMem, StrInstr->getPredicate());
   2075           CurInstr->setDeleted();
   2076         }
   2077       } else if (auto *StrexInstr = llvm::dyn_cast<InstARM32Strex>(CurInstr)) {
   2078         constexpr bool DisallowOffsetsBecauseStrex = false;
   2079         if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
   2080                 llvm::cast<OperandARM32Mem>(StrexInstr->getSrc(1)),
   2081                 DisallowOffsetsBecauseStrex)) {
   2082           Sandboxer(this).strex(CurInstr->getDest(),
   2083                                 llvm::cast<Variable>(CurInstr->getSrc(0)),
   2084                                 LegalMem, StrexInstr->getPredicate());
   2085           CurInstr->setDeleted();
   2086         }
   2087       }
   2088 
   2089       // Sanity-check: the Legalizer will either have no Temp, or it will be
   2090       // bound to IP.
   2091       Legalizer.assertNoTempOrAssignedToIP();
   2092     }
   2093   }
   2094 }
   2095 
   2096 Operand *TargetARM32::loOperand(Operand *Operand) {
   2097   assert(Operand->getType() == IceType_i64);
   2098   if (Operand->getType() != IceType_i64)
   2099     return Operand;
   2100   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
   2101     return Var64On32->getLo();
   2102   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand))
   2103     return Ctx->getConstantInt32(static_cast<uint32_t>(Const->getValue()));
   2104   if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
   2105     // Conservatively disallow memory operands with side-effects (pre/post
   2106     // increment) in case of duplication.
   2107     assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
   2108            Mem->getAddrMode() == OperandARM32Mem::NegOffset);
   2109     if (Mem->isRegReg()) {
   2110       Variable *IndexR = legalizeToReg(Mem->getIndex());
   2111       return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(), IndexR,
   2112                                      Mem->getShiftOp(), Mem->getShiftAmt(),
   2113                                      Mem->getAddrMode());
   2114     } else {
   2115       return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
   2116                                      Mem->getOffset(), Mem->getAddrMode());
   2117     }
   2118   }
   2119   llvm::report_fatal_error("Unsupported operand type");
   2120   return nullptr;
   2121 }
   2122 
   2123 Operand *TargetARM32::hiOperand(Operand *Operand) {
   2124   assert(Operand->getType() == IceType_i64);
   2125   if (Operand->getType() != IceType_i64)
   2126     return Operand;
   2127   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
   2128     return Var64On32->getHi();
   2129   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
   2130     return Ctx->getConstantInt32(
   2131         static_cast<uint32_t>(Const->getValue() >> 32));
   2132   }
   2133   if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
   2134     // Conservatively disallow memory operands with side-effects in case of
   2135     // duplication.
   2136     assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
   2137            Mem->getAddrMode() == OperandARM32Mem::NegOffset);
   2138     const Type SplitType = IceType_i32;
   2139     if (Mem->isRegReg()) {
   2140       // We have to make a temp variable T, and add 4 to either Base or Index.
   2141       // The Index may be shifted, so adding 4 can mean something else. Thus,
   2142       // prefer T := Base + 4, and use T as the new Base.
   2143       Variable *Base = Mem->getBase();
   2144       Constant *Four = Ctx->getConstantInt32(4);
   2145       Variable *NewBase = Func->makeVariable(Base->getType());
   2146       lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
   2147                                              Base, Four));
   2148       Variable *BaseR = legalizeToReg(NewBase);
   2149       Variable *IndexR = legalizeToReg(Mem->getIndex());
   2150       return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR,
   2151                                      Mem->getShiftOp(), Mem->getShiftAmt(),
   2152                                      Mem->getAddrMode());
   2153     } else {
   2154       Variable *Base = Mem->getBase();
   2155       ConstantInteger32 *Offset = Mem->getOffset();
   2156       assert(!Utils::WouldOverflowAdd(Offset->getValue(), 4));
   2157       int32_t NextOffsetVal = Offset->getValue() + 4;
   2158       constexpr bool ZeroExt = false;
   2159       if (!OperandARM32Mem::canHoldOffset(SplitType, ZeroExt, NextOffsetVal)) {
   2160         // We have to make a temp variable and add 4 to either Base or Offset.
   2161         // If we add 4 to Offset, this will convert a non-RegReg addressing
   2162         // mode into a RegReg addressing mode. Since NaCl sandboxing disallows
   2163         // RegReg addressing modes, prefer adding to base and replacing
   2164         // instead. Thus we leave the old offset alone.
   2165         Constant *_4 = Ctx->getConstantInt32(4);
   2166         Variable *NewBase = Func->makeVariable(Base->getType());
   2167         lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
   2168                                                NewBase, Base, _4));
   2169         Base = NewBase;
   2170       } else {
   2171         Offset =
   2172             llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
   2173       }
   2174       Variable *BaseR = legalizeToReg(Base);
   2175       return OperandARM32Mem::create(Func, SplitType, BaseR, Offset,
   2176                                      Mem->getAddrMode());
   2177     }
   2178   }
   2179   llvm::report_fatal_error("Unsupported operand type");
   2180   return nullptr;
   2181 }
   2182 
   2183 SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
   2184                                            RegSetMask Exclude) const {
   2185   SmallBitVector Registers(RegARM32::Reg_NUM);
   2186 
   2187   for (uint32_t i = 0; i < RegARM32::Reg_NUM; ++i) {
   2188     const auto &Entry = RegARM32::RegTable[i];
   2189     if (Entry.Scratch && (Include & RegSet_CallerSave))
   2190       Registers[i] = true;
   2191     if (Entry.Preserved && (Include & RegSet_CalleeSave))
   2192       Registers[i] = true;
   2193     if (Entry.StackPtr && (Include & RegSet_StackPointer))
   2194       Registers[i] = true;
   2195     if (Entry.FramePtr && (Include & RegSet_FramePointer))
   2196       Registers[i] = true;
   2197     if (Entry.Scratch && (Exclude & RegSet_CallerSave))
   2198       Registers[i] = false;
   2199     if (Entry.Preserved && (Exclude & RegSet_CalleeSave))
   2200       Registers[i] = false;
   2201     if (Entry.StackPtr && (Exclude & RegSet_StackPointer))
   2202       Registers[i] = false;
   2203     if (Entry.FramePtr && (Exclude & RegSet_FramePointer))
   2204       Registers[i] = false;
   2205   }
   2206 
   2207   return Registers;
   2208 }
   2209 
   2210 void TargetARM32::lowerAlloca(const InstAlloca *Instr) {
   2211   // Conservatively require the stack to be aligned. Some stack adjustment
   2212   // operations implemented below assume that the stack is aligned before the
   2213   // alloca. All the alloca code ensures that the stack alignment is preserved
   2214   // after the alloca. The stack alignment restriction can be relaxed in some
   2215   // cases.
   2216   NeedsStackAlignment = true;
   2217 
   2218   // For default align=0, set it to the real value 1, to avoid any
   2219   // bit-manipulation problems below.
   2220   const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
   2221 
   2222   // LLVM enforces power of 2 alignment.
   2223   assert(llvm::isPowerOf2_32(AlignmentParam));
   2224   assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
   2225 
   2226   const uint32_t Alignment =
   2227       std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
   2228   const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES;
   2229   const bool OptM1 = Func->getOptLevel() == Opt_m1;
   2230   const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
   2231   const bool UseFramePointer =
   2232       hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
   2233 
   2234   if (UseFramePointer)
   2235     setHasFramePointer();
   2236 
   2237   Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   2238   if (OverAligned) {
   2239     Sandboxer(this).align_sp(Alignment);
   2240   }
   2241 
   2242   Variable *Dest = Instr->getDest();
   2243   Operand *TotalSize = Instr->getSizeInBytes();
   2244 
   2245   if (const auto *ConstantTotalSize =
   2246           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
   2247     const uint32_t Value =
   2248         Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
   2249     // Constant size alloca.
   2250     if (!UseFramePointer) {
   2251       // If we don't need a Frame Pointer, this alloca has a known offset to the
   2252       // stack pointer. We don't need adjust the stack pointer, nor assign any
   2253       // value to Dest, as Dest is rematerializable.
   2254       assert(Dest->isRematerializable());
   2255       FixedAllocaSizeBytes += Value;
   2256       Context.insert<InstFakeDef>(Dest);
   2257       return;
   2258     }
   2259 
   2260     // If a frame pointer is required, then we need to store the alloca'd result
   2261     // in Dest.
   2262     Operand *SubAmountRF =
   2263         legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);
   2264     Sandboxer(this).sub_sp(SubAmountRF);
   2265   } else {
   2266     // Non-constant sizes need to be adjusted to the next highest multiple of
   2267     // the required alignment at runtime.
   2268     TotalSize = legalize(TotalSize, Legal_Reg | Legal_Flex);
   2269     Variable *T = makeReg(IceType_i32);
   2270     _mov(T, TotalSize);
   2271     Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1));
   2272     _add(T, T, AddAmount);
   2273     alignRegisterPow2(T, Alignment);
   2274     Sandboxer(this).sub_sp(T);
   2275   }
   2276 
   2277   // Adds back a few bytes to SP to account for the out args area.
   2278   Variable *T = SP;
   2279   if (MaxOutArgsSizeBytes != 0) {
   2280     T = makeReg(getPointerType());
   2281     Operand *OutArgsSizeRF = legalize(
   2282         Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);
   2283     _add(T, SP, OutArgsSizeRF);
   2284   }
   2285 
   2286   _mov(Dest, T);
   2287 }
   2288 
   2289 void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {
   2290   if (isGuaranteedNonzeroInt(SrcLo) || isGuaranteedNonzeroInt(SrcHi))
   2291     return;
   2292   Variable *SrcLoReg = legalizeToReg(SrcLo);
   2293   switch (Ty) {
   2294   default:
   2295     llvm_unreachable(
   2296         ("Unexpected type in div0Check: " + typeStdString(Ty)).c_str());
   2297   case IceType_i8:
   2298   case IceType_i16: {
   2299     Operand *ShAmtImm = shAmtImm(32 - getScalarIntBitWidth(Ty));
   2300     Variable *T = makeReg(IceType_i32);
   2301     _lsls(T, SrcLoReg, ShAmtImm);
   2302     Context.insert<InstFakeUse>(T);
   2303   } break;
   2304   case IceType_i32: {
   2305     _tst(SrcLoReg, SrcLoReg);
   2306     break;
   2307   }
   2308   case IceType_i64: {
   2309     Variable *T = makeReg(IceType_i32);
   2310     _orrs(T, SrcLoReg, legalize(SrcHi, Legal_Reg | Legal_Flex));
   2311     // T isn't going to be used, but we need the side-effect of setting flags
   2312     // from this operation.
   2313     Context.insert<InstFakeUse>(T);
   2314   }
   2315   }
   2316   auto *Label = InstARM32Label::create(Func, this);
   2317   _br(Label, CondARM32::NE);
   2318   _trap();
   2319   Context.insert(Label);
   2320 }
   2321 
   2322 void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R,
   2323                                Operand *Src1, ExtInstr ExtFunc,
   2324                                DivInstr DivFunc, bool IsRemainder) {
   2325   div0Check(Dest->getType(), Src1, nullptr);
   2326   Variable *Src1R = legalizeToReg(Src1);
   2327   Variable *T0R = Src0R;
   2328   Variable *T1R = Src1R;
   2329   if (Dest->getType() != IceType_i32) {
   2330     T0R = makeReg(IceType_i32);
   2331     (this->*ExtFunc)(T0R, Src0R, CondARM32::AL);
   2332     T1R = makeReg(IceType_i32);
   2333     (this->*ExtFunc)(T1R, Src1R, CondARM32::AL);
   2334   }
   2335   if (hasCPUFeature(TargetARM32Features::HWDivArm)) {
   2336     (this->*DivFunc)(T, T0R, T1R, CondARM32::AL);
   2337     if (IsRemainder) {
   2338       Variable *T2 = makeReg(IceType_i32);
   2339       _mls(T2, T, T1R, T0R);
   2340       T = T2;
   2341     }
   2342     _mov(Dest, T);
   2343   } else {
   2344     llvm::report_fatal_error("div should have already been turned into a call");
   2345   }
   2346 }
   2347 
   2348 TargetARM32::SafeBoolChain
   2349 TargetARM32::lowerInt1Arithmetic(const InstArithmetic *Instr) {
   2350   Variable *Dest = Instr->getDest();
   2351   assert(Dest->getType() == IceType_i1);
   2352 
   2353   // So folding didn't work for Instr. Not a problem: We just need to
   2354   // materialize the Sources, and perform the operation. We create regular
   2355   // Variables (and not infinite-weight ones) because this call might recurse a
   2356   // lot, and we might end up with tons of infinite weight temporaries.
   2357   assert(Instr->getSrcSize() == 2);
   2358   Variable *Src0 = Func->makeVariable(IceType_i1);
   2359   SafeBoolChain Src0Safe = lowerInt1(Src0, Instr->getSrc(0));
   2360 
   2361   Operand *Src1 = Instr->getSrc(1);
   2362   SafeBoolChain Src1Safe = SBC_Yes;
   2363 
   2364   if (!llvm::isa<Constant>(Src1)) {
   2365     Variable *Src1V = Func->makeVariable(IceType_i1);
   2366     Src1Safe = lowerInt1(Src1V, Src1);
   2367     Src1 = Src1V;
   2368   }
   2369 
   2370   Variable *T = makeReg(IceType_i1);
   2371   Src0 = legalizeToReg(Src0);
   2372   Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
   2373   switch (Instr->getOp()) {
   2374   default:
   2375     // If this Unreachable is ever executed, add the offending operation to
   2376     // the list of valid consumers.
   2377     llvm::report_fatal_error("Unhandled i1 Op");
   2378   case InstArithmetic::And:
   2379     _and(T, Src0, Src1RF);
   2380     break;
   2381   case InstArithmetic::Or:
   2382     _orr(T, Src0, Src1RF);
   2383     break;
   2384   case InstArithmetic::Xor:
   2385     _eor(T, Src0, Src1RF);
   2386     break;
   2387   }
   2388   _mov(Dest, T);
   2389   return Src0Safe == SBC_Yes && Src1Safe == SBC_Yes ? SBC_Yes : SBC_No;
   2390 }
   2391 
   2392 namespace {
   2393 // NumericOperands is used during arithmetic/icmp lowering for constant folding.
   2394 // It holds the two sources operands, and maintains some state as to whether one
   2395 // of them is a constant. If one of the operands is a constant, then it will be
   2396 // be stored as the operation's second source, with a bit indicating whether the
   2397 // operands were swapped.
   2398 //
   2399 // The class is split into a base class with operand type-independent methods,
   2400 // and a derived, templated class, for each type of operand we want to fold
   2401 // constants for:
   2402 //
   2403 // NumericOperandsBase --> NumericOperands<ConstantFloat>
   2404 //                     --> NumericOperands<ConstantDouble>
   2405 //                     --> NumericOperands<ConstantInt32>
   2406 //
   2407 // NumericOperands<ConstantInt32> also exposes helper methods for emitting
   2408 // inverted/negated immediates.
   2409 class NumericOperandsBase {
   2410   NumericOperandsBase() = delete;
   2411   NumericOperandsBase(const NumericOperandsBase &) = delete;
   2412   NumericOperandsBase &operator=(const NumericOperandsBase &) = delete;
   2413 
   2414 public:
   2415   NumericOperandsBase(Operand *S0, Operand *S1)
   2416       : Src0(NonConstOperand(S0, S1)), Src1(ConstOperand(S0, S1)),
   2417         Swapped(Src0 == S1 && S0 != S1) {
   2418     assert(Src0 != nullptr);
   2419     assert(Src1 != nullptr);
   2420     assert(Src0 != Src1 || S0 == S1);
   2421   }
   2422 
   2423   bool hasConstOperand() const {
   2424     return llvm::isa<Constant>(Src1) && !llvm::isa<ConstantRelocatable>(Src1);
   2425   }
   2426 
   2427   bool swappedOperands() const { return Swapped; }
   2428 
   2429   Variable *src0R(TargetARM32 *Target) const {
   2430     return legalizeToReg(Target, Src0);
   2431   }
   2432 
   2433   Variable *unswappedSrc0R(TargetARM32 *Target) const {
   2434     return legalizeToReg(Target, Swapped ? Src1 : Src0);
   2435   }
   2436 
   2437   Operand *src1RF(TargetARM32 *Target) const {
   2438     return legalizeToRegOrFlex(Target, Src1);
   2439   }
   2440 
   2441   Variable *unswappedSrc1R(TargetARM32 *Target) const {
   2442     return legalizeToReg(Target, Swapped ? Src0 : Src1);
   2443   }
   2444 
   2445   Operand *src1() const { return Src1; }
   2446 
   2447 protected:
   2448   Operand *const Src0;
   2449   Operand *const Src1;
   2450   const bool Swapped;
   2451 
   2452   static Variable *legalizeToReg(TargetARM32 *Target, Operand *Src) {
   2453     return Target->legalizeToReg(Src);
   2454   }
   2455 
   2456   static Operand *legalizeToRegOrFlex(TargetARM32 *Target, Operand *Src) {
   2457     return Target->legalize(Src,
   2458                             TargetARM32::Legal_Reg | TargetARM32::Legal_Flex);
   2459   }
   2460 
   2461 private:
   2462   static Operand *NonConstOperand(Operand *S0, Operand *S1) {
   2463     if (!llvm::isa<Constant>(S0))
   2464       return S0;
   2465     if (!llvm::isa<Constant>(S1))
   2466       return S1;
   2467     if (llvm::isa<ConstantRelocatable>(S1) &&
   2468         !llvm::isa<ConstantRelocatable>(S0))
   2469       return S1;
   2470     return S0;
   2471   }
   2472 
   2473   static Operand *ConstOperand(Operand *S0, Operand *S1) {
   2474     if (!llvm::isa<Constant>(S0))
   2475       return S1;
   2476     if (!llvm::isa<Constant>(S1))
   2477       return S0;
   2478     if (llvm::isa<ConstantRelocatable>(S1) &&
   2479         !llvm::isa<ConstantRelocatable>(S0))
   2480       return S0;
   2481     return S1;
   2482   }
   2483 };
   2484 
   2485 template <typename C> class NumericOperands : public NumericOperandsBase {
   2486   NumericOperands() = delete;
   2487   NumericOperands(const NumericOperands &) = delete;
   2488   NumericOperands &operator=(const NumericOperands &) = delete;
   2489 
   2490 public:
   2491   NumericOperands(Operand *S0, Operand *S1) : NumericOperandsBase(S0, S1) {
   2492     assert(!hasConstOperand() || llvm::isa<C>(this->Src1));
   2493   }
   2494 
   2495   typename C::PrimType getConstantValue() const {
   2496     return llvm::cast<C>(Src1)->getValue();
   2497   }
   2498 };
   2499 
   2500 using FloatOperands = NumericOperands<ConstantFloat>;
   2501 using DoubleOperands = NumericOperands<ConstantDouble>;
   2502 
   2503 class Int32Operands : public NumericOperands<ConstantInteger32> {
   2504   Int32Operands() = delete;
   2505   Int32Operands(const Int32Operands &) = delete;
   2506   Int32Operands &operator=(const Int32Operands &) = delete;
   2507 
   2508 public:
   2509   Int32Operands(Operand *S0, Operand *S1) : NumericOperands(S0, S1) {}
   2510 
   2511   Operand *unswappedSrc1RShAmtImm(TargetARM32 *Target) const {
   2512     if (!swappedOperands() && hasConstOperand()) {
   2513       return Target->shAmtImm(getConstantValue() & 0x1F);
   2514     }
   2515     return legalizeToReg(Target, Swapped ? Src0 : Src1);
   2516   }
   2517 
   2518   bool isSrc1ImmediateZero() const {
   2519     if (!swappedOperands() && hasConstOperand()) {
   2520       return getConstantValue() == 0;
   2521     }
   2522     return false;
   2523   }
   2524 
   2525   bool immediateIsFlexEncodable() const {
   2526     uint32_t Rotate, Imm8;
   2527     return OperandARM32FlexImm::canHoldImm(getConstantValue(), &Rotate, &Imm8);
   2528   }
   2529 
   2530   bool negatedImmediateIsFlexEncodable() const {
   2531     uint32_t Rotate, Imm8;
   2532     return OperandARM32FlexImm::canHoldImm(
   2533         -static_cast<int32_t>(getConstantValue()), &Rotate, &Imm8);
   2534   }
   2535 
   2536   Operand *negatedSrc1F(TargetARM32 *Target) const {
   2537     return legalizeToRegOrFlex(Target,
   2538                                Target->getCtx()->getConstantInt32(
   2539                                    -static_cast<int32_t>(getConstantValue())));
   2540   }
   2541 
   2542   bool invertedImmediateIsFlexEncodable() const {
   2543     uint32_t Rotate, Imm8;
   2544     return OperandARM32FlexImm::canHoldImm(
   2545         ~static_cast<uint32_t>(getConstantValue()), &Rotate, &Imm8);
   2546   }
   2547 
   2548   Operand *invertedSrc1F(TargetARM32 *Target) const {
   2549     return legalizeToRegOrFlex(Target,
   2550                                Target->getCtx()->getConstantInt32(
   2551                                    ~static_cast<uint32_t>(getConstantValue())));
   2552   }
   2553 };
   2554 } // end of anonymous namespace
   2555 
   2556 void TargetARM32::preambleDivRem(const InstCall *Instr) {
   2557   Operand *Src1 = Instr->getArg(1);
   2558 
   2559   switch (Src1->getType()) {
   2560   default:
   2561     llvm::report_fatal_error("Invalid type for idiv.");
   2562   case IceType_i64: {
   2563     if (auto *C = llvm::dyn_cast<ConstantInteger64>(Src1)) {
   2564       if (C->getValue() == 0) {
   2565         _trap();
   2566         return;
   2567       }
   2568     }
   2569     div0Check(IceType_i64, loOperand(Src1), hiOperand(Src1));
   2570     return;
   2571   }
   2572   case IceType_i32: {
   2573     // Src0 and Src1 have already been appropriately extended to an i32, so we
   2574     // don't check for i8 and i16.
   2575     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
   2576       if (C->getValue() == 0) {
   2577         _trap();
   2578         return;
   2579       }
   2580     }
   2581     div0Check(IceType_i32, Src1, nullptr);
   2582     return;
   2583   }
   2584   }
   2585 }
   2586 
   2587 void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,
   2588                                        Variable *Dest, Operand *Src0,
   2589                                        Operand *Src1) {
   2590   Int32Operands SrcsLo(loOperand(Src0), loOperand(Src1));
   2591   Int32Operands SrcsHi(hiOperand(Src0), hiOperand(Src1));
   2592   assert(SrcsLo.swappedOperands() == SrcsHi.swappedOperands());
   2593   assert(SrcsLo.hasConstOperand() == SrcsHi.hasConstOperand());
   2594 
   2595   auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   2596   auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   2597   Variable *T_Lo = makeReg(DestLo->getType());
   2598   Variable *T_Hi = makeReg(DestHi->getType());
   2599 
   2600   switch (Op) {
   2601   case InstArithmetic::_num:
   2602     llvm::report_fatal_error("Unknown arithmetic operator");
   2603     return;
   2604   case InstArithmetic::Add: {
   2605     Variable *Src0LoR = SrcsLo.src0R(this);
   2606     Operand *Src1LoRF = SrcsLo.src1RF(this);
   2607     Variable *Src0HiR = SrcsHi.src0R(this);
   2608     Operand *Src1HiRF = SrcsHi.src1RF(this);
   2609     _adds(T_Lo, Src0LoR, Src1LoRF);
   2610     _mov(DestLo, T_Lo);
   2611     _adc(T_Hi, Src0HiR, Src1HiRF);
   2612     _mov(DestHi, T_Hi);
   2613     return;
   2614   }
   2615   case InstArithmetic::And: {
   2616     Variable *Src0LoR = SrcsLo.src0R(this);
   2617     Operand *Src1LoRF = SrcsLo.src1RF(this);
   2618     Variable *Src0HiR = SrcsHi.src0R(this);
   2619     Operand *Src1HiRF = SrcsHi.src1RF(this);
   2620     _and(T_Lo, Src0LoR, Src1LoRF);
   2621     _mov(DestLo, T_Lo);
   2622     _and(T_Hi, Src0HiR, Src1HiRF);
   2623     _mov(DestHi, T_Hi);
   2624     return;
   2625   }
   2626   case InstArithmetic::Or: {
   2627     Variable *Src0LoR = SrcsLo.src0R(this);
   2628     Operand *Src1LoRF = SrcsLo.src1RF(this);
   2629     Variable *Src0HiR = SrcsHi.src0R(this);
   2630     Operand *Src1HiRF = SrcsHi.src1RF(this);
   2631     _orr(T_Lo, Src0LoR, Src1LoRF);
   2632     _mov(DestLo, T_Lo);
   2633     _orr(T_Hi, Src0HiR, Src1HiRF);
   2634     _mov(DestHi, T_Hi);
   2635     return;
   2636   }
   2637   case InstArithmetic::Xor: {
   2638     Variable *Src0LoR = SrcsLo.src0R(this);
   2639     Operand *Src1LoRF = SrcsLo.src1RF(this);
   2640     Variable *Src0HiR = SrcsHi.src0R(this);
   2641     Operand *Src1HiRF = SrcsHi.src1RF(this);
   2642     _eor(T_Lo, Src0LoR, Src1LoRF);
   2643     _mov(DestLo, T_Lo);
   2644     _eor(T_Hi, Src0HiR, Src1HiRF);
   2645     _mov(DestHi, T_Hi);
   2646     return;
   2647   }
   2648   case InstArithmetic::Sub: {
   2649     Variable *Src0LoR = SrcsLo.src0R(this);
   2650     Operand *Src1LoRF = SrcsLo.src1RF(this);
   2651     Variable *Src0HiR = SrcsHi.src0R(this);
   2652     Operand *Src1HiRF = SrcsHi.src1RF(this);
   2653     if (SrcsLo.swappedOperands()) {
   2654       _rsbs(T_Lo, Src0LoR, Src1LoRF);
   2655       _mov(DestLo, T_Lo);
   2656       _rsc(T_Hi, Src0HiR, Src1HiRF);
   2657       _mov(DestHi, T_Hi);
   2658     } else {
   2659       _subs(T_Lo, Src0LoR, Src1LoRF);
   2660       _mov(DestLo, T_Lo);
   2661       _sbc(T_Hi, Src0HiR, Src1HiRF);
   2662       _mov(DestHi, T_Hi);
   2663     }
   2664     return;
   2665   }
   2666   case InstArithmetic::Mul: {
   2667     // GCC 4.8 does:
   2668     // a=b*c ==>
   2669     //   t_acc =(mul) (b.lo * c.hi)
   2670     //   t_acc =(mla) (c.lo * b.hi) + t_acc
   2671     //   t.hi,t.lo =(umull) b.lo * c.lo
   2672     //   t.hi += t_acc
   2673     //   a.lo = t.lo
   2674     //   a.hi = t.hi
   2675     //
   2676     // LLVM does:
   2677     //   t.hi,t.lo =(umull) b.lo * c.lo
   2678     //   t.hi =(mla) (b.lo * c.hi) + t.hi
   2679     //   t.hi =(mla) (b.hi * c.lo) + t.hi
   2680     //   a.lo = t.lo
   2681     //   a.hi = t.hi
   2682     //
   2683     // LLVM's lowering has fewer instructions, but more register pressure:
   2684     // t.lo is live from beginning to end, while GCC delays the two-dest
   2685     // instruction till the end, and kills c.hi immediately.
   2686     Variable *T_Acc = makeReg(IceType_i32);
   2687     Variable *T_Acc1 = makeReg(IceType_i32);
   2688     Variable *T_Hi1 = makeReg(IceType_i32);
   2689     Variable *Src0RLo = SrcsLo.unswappedSrc0R(this);
   2690     Variable *Src0RHi = SrcsHi.unswappedSrc0R(this);
   2691     Variable *Src1RLo = SrcsLo.unswappedSrc1R(this);
   2692     Variable *Src1RHi = SrcsHi.unswappedSrc1R(this);
   2693     _mul(T_Acc, Src0RLo, Src1RHi);
   2694     _mla(T_Acc1, Src1RLo, Src0RHi, T_Acc);
   2695     _umull(T_Lo, T_Hi1, Src0RLo, Src1RLo);
   2696     _add(T_Hi, T_Hi1, T_Acc1);
   2697     _mov(DestLo, T_Lo);
   2698     _mov(DestHi, T_Hi);
   2699     return;
   2700   }
   2701   case InstArithmetic::Shl: {
   2702     if (!SrcsLo.swappedOperands() && SrcsLo.hasConstOperand()) {
   2703       Variable *Src0RLo = SrcsLo.src0R(this);
   2704       // Truncating the ShAmt to [0, 63] because that's what ARM does anyway.
   2705       const int32_t ShAmtImm = SrcsLo.getConstantValue() & 0x3F;
   2706       if (ShAmtImm == 0) {
   2707         _mov(DestLo, Src0RLo);
   2708         _mov(DestHi, SrcsHi.src0R(this));
   2709         return;
   2710       }
   2711 
   2712       if (ShAmtImm >= 32) {
   2713         if (ShAmtImm == 32) {
   2714           _mov(DestHi, Src0RLo);
   2715         } else {
   2716           Operand *ShAmtOp = shAmtImm(ShAmtImm - 32);
   2717           _lsl(T_Hi, Src0RLo, ShAmtOp);
   2718           _mov(DestHi, T_Hi);
   2719         }
   2720 
   2721         Operand *_0 =
   2722             legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   2723         _mov(T_Lo, _0);
   2724         _mov(DestLo, T_Lo);
   2725         return;
   2726       }
   2727 
   2728       Variable *Src0RHi = SrcsHi.src0R(this);
   2729       Operand *ShAmtOp = shAmtImm(ShAmtImm);
   2730       Operand *ComplShAmtOp = shAmtImm(32 - ShAmtImm);
   2731       _lsl(T_Hi, Src0RHi, ShAmtOp);
   2732       _orr(T_Hi, T_Hi,
   2733            OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo,
   2734                                        OperandARM32::LSR, ComplShAmtOp));
   2735       _mov(DestHi, T_Hi);
   2736 
   2737       _lsl(T_Lo, Src0RLo, ShAmtOp);
   2738       _mov(DestLo, T_Lo);
   2739       return;
   2740     }
   2741 
   2742     // a=b<<c ==>
   2743     // pnacl-llc does:
   2744     // mov     t_b.lo, b.lo
   2745     // mov     t_b.hi, b.hi
   2746     // mov     t_c.lo, c.lo
   2747     // rsb     T0, t_c.lo, #32
   2748     // lsr     T1, t_b.lo, T0
   2749     // orr     t_a.hi, T1, t_b.hi, lsl t_c.lo
   2750     // sub     T2, t_c.lo, #32
   2751     // cmp     T2, #0
   2752     // lslge   t_a.hi, t_b.lo, T2
   2753     // lsl     t_a.lo, t_b.lo, t_c.lo
   2754     // mov     a.lo, t_a.lo
   2755     // mov     a.hi, t_a.hi
   2756     //
   2757     // GCC 4.8 does:
   2758     // sub t_c1, c.lo, #32
   2759     // lsl t_hi, b.hi, c.lo
   2760     // orr t_hi, t_hi, b.lo, lsl t_c1
   2761     // rsb t_c2, c.lo, #32
   2762     // orr t_hi, t_hi, b.lo, lsr t_c2
   2763     // lsl t_lo, b.lo, c.lo
   2764     // a.lo = t_lo
   2765     // a.hi = t_hi
   2766     //
   2767     // These are incompatible, therefore we mimic pnacl-llc.
   2768     // Can be strength-reduced for constant-shifts, but we don't do that for
   2769     // now.
   2770     // Given the sub/rsb T_C, C.lo, #32, one of the T_C will be negative. On
   2771     // ARM, shifts only take the lower 8 bits of the shift register, and
   2772     // saturate to the range 0-32, so the negative value will saturate to 32.
   2773     Operand *_32 = legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
   2774     Operand *_0 =
   2775         legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   2776     Variable *T0 = makeReg(IceType_i32);
   2777     Variable *T1 = makeReg(IceType_i32);
   2778     Variable *T2 = makeReg(IceType_i32);
   2779     Variable *TA_Hi = makeReg(IceType_i32);
   2780     Variable *TA_Lo = makeReg(IceType_i32);
   2781     Variable *Src0RLo = SrcsLo.unswappedSrc0R(this);
   2782     Variable *Src0RHi = SrcsHi.unswappedSrc0R(this);
   2783     Variable *Src1RLo = SrcsLo.unswappedSrc1R(this);
   2784     _rsb(T0, Src1RLo, _32);
   2785     _lsr(T1, Src0RLo, T0);
   2786     _orr(TA_Hi, T1, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
   2787                                                 OperandARM32::LSL, Src1RLo));
   2788     _sub(T2, Src1RLo, _32);
   2789     _cmp(T2, _0);
   2790     _lsl(TA_Hi, Src0RLo, T2, CondARM32::GE);
   2791     _set_dest_redefined();
   2792     _lsl(TA_Lo, Src0RLo, Src1RLo);
   2793     _mov(DestLo, TA_Lo);
   2794     _mov(DestHi, TA_Hi);
   2795     return;
   2796   }
   2797   case InstArithmetic::Lshr:
   2798   case InstArithmetic::Ashr: {
   2799     const bool ASR = Op == InstArithmetic::Ashr;
   2800     if (!SrcsLo.swappedOperands() && SrcsLo.hasConstOperand()) {
   2801       Variable *Src0RHi = SrcsHi.src0R(this);
   2802       // Truncating the ShAmt to [0, 63] because that's what ARM does anyway.
   2803       const int32_t ShAmt = SrcsLo.getConstantValue() & 0x3F;
   2804       if (ShAmt == 0) {
   2805         _mov(DestHi, Src0RHi);
   2806         _mov(DestLo, SrcsLo.src0R(this));
   2807         return;
   2808       }
   2809 
   2810       if (ShAmt >= 32) {
   2811         if (ShAmt == 32) {
   2812           _mov(DestLo, Src0RHi);
   2813         } else {
   2814           Operand *ShAmtImm = shAmtImm(ShAmt - 32);
   2815           if (ASR) {
   2816             _asr(T_Lo, Src0RHi, ShAmtImm);
   2817           } else {
   2818             _lsr(T_Lo, Src0RHi, ShAmtImm);
   2819           }
   2820           _mov(DestLo, T_Lo);
   2821         }
   2822 
   2823         if (ASR) {
   2824           Operand *_31 = shAmtImm(31);
   2825           _asr(T_Hi, Src0RHi, _31);
   2826         } else {
   2827           Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32),
   2828                                  Legal_Reg | Legal_Flex);
   2829           _mov(T_Hi, _0);
   2830         }
   2831         _mov(DestHi, T_Hi);
   2832         return;
   2833       }
   2834 
   2835       Variable *Src0RLo = SrcsLo.src0R(this);
   2836       Operand *ShAmtImm = shAmtImm(ShAmt);
   2837       Operand *ComplShAmtImm = shAmtImm(32 - ShAmt);
   2838       _lsr(T_Lo, Src0RLo, ShAmtImm);
   2839       _orr(T_Lo, T_Lo,
   2840            OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
   2841                                        OperandARM32::LSL, ComplShAmtImm));
   2842       _mov(DestLo, T_Lo);
   2843 
   2844       if (ASR) {
   2845         _asr(T_Hi, Src0RHi, ShAmtImm);
   2846       } else {
   2847         _lsr(T_Hi, Src0RHi, ShAmtImm);
   2848       }
   2849       _mov(DestHi, T_Hi);
   2850       return;
   2851     }
   2852 
   2853     // a=b>>c
   2854     // pnacl-llc does:
   2855     // mov        t_b.lo, b.lo
   2856     // mov        t_b.hi, b.hi
   2857     // mov        t_c.lo, c.lo
   2858     // lsr        T0, t_b.lo, t_c.lo
   2859     // rsb        T1, t_c.lo, #32
   2860     // orr        t_a.lo, T0, t_b.hi, lsl T1
   2861     // sub        T2, t_c.lo, #32
   2862     // cmp        T2, #0
   2863     // [al]srge   t_a.lo, t_b.hi, T2
   2864     // [al]sr     t_a.hi, t_b.hi, t_c.lo
   2865     // mov        a.lo, t_a.lo
   2866     // mov        a.hi, t_a.hi
   2867     //
   2868     // GCC 4.8 does (lsr):
   2869     // rsb        t_c1, c.lo, #32
   2870     // lsr        t_lo, b.lo, c.lo
   2871     // orr        t_lo, t_lo, b.hi, lsl t_c1
   2872     // sub        t_c2, c.lo, #32
   2873     // orr        t_lo, t_lo, b.hi, lsr t_c2
   2874     // lsr        t_hi, b.hi, c.lo
   2875     // mov        a.lo, t_lo
   2876     // mov        a.hi, t_hi
   2877     //
   2878     // These are incompatible, therefore we mimic pnacl-llc.
   2879     Operand *_32 = legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
   2880     Operand *_0 =
   2881         legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   2882     Variable *T0 = makeReg(IceType_i32);
   2883     Variable *T1 = makeReg(IceType_i32);
   2884     Variable *T2 = makeReg(IceType_i32);
   2885     Variable *TA_Lo = makeReg(IceType_i32);
   2886     Variable *TA_Hi = makeReg(IceType_i32);
   2887     Variable *Src0RLo = SrcsLo.unswappedSrc0R(this);
   2888     Variable *Src0RHi = SrcsHi.unswappedSrc0R(this);
   2889     Variable *Src1RLo = SrcsLo.unswappedSrc1R(this);
   2890     _lsr(T0, Src0RLo, Src1RLo);
   2891     _rsb(T1, Src1RLo, _32);
   2892     _orr(TA_Lo, T0, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
   2893                                                 OperandARM32::LSL, T1));
   2894     _sub(T2, Src1RLo, _32);
   2895     _cmp(T2, _0);
   2896     if (ASR) {
   2897       _asr(TA_Lo, Src0RHi, T2, CondARM32::GE);
   2898       _set_dest_redefined();
   2899       _asr(TA_Hi, Src0RHi, Src1RLo);
   2900     } else {
   2901       _lsr(TA_Lo, Src0RHi, T2, CondARM32::GE);
   2902       _set_dest_redefined();
   2903       _lsr(TA_Hi, Src0RHi, Src1RLo);
   2904     }
   2905     _mov(DestLo, TA_Lo);
   2906     _mov(DestHi, TA_Hi);
   2907     return;
   2908   }
   2909   case InstArithmetic::Fadd:
   2910   case InstArithmetic::Fsub:
   2911   case InstArithmetic::Fmul:
   2912   case InstArithmetic::Fdiv:
   2913   case InstArithmetic::Frem:
   2914     llvm::report_fatal_error("FP instruction with i64 type");
   2915     return;
   2916   case InstArithmetic::Udiv:
   2917   case InstArithmetic::Sdiv:
   2918   case InstArithmetic::Urem:
   2919   case InstArithmetic::Srem:
   2920     llvm::report_fatal_error("Call-helper-involved instruction for i64 type "
   2921                              "should have already been handled before");
   2922     return;
   2923   }
   2924 }
   2925 
   2926 namespace {
   2927 // StrengthReduction is a namespace with the strength reduction machinery. The
   2928 // entry point is the StrengthReduction::tryToOptimize method. It returns true
   2929 // if the optimization can be performed, and false otherwise.
   2930 //
   2931 // If the optimization can be performed, tryToOptimize sets its NumOperations
   2932 // parameter to the number of shifts that are needed to perform the
   2933 // multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub>
   2934 // tuples that describe how to materialize the multiplication.
   2935 //
   2936 // The algorithm finds contiguous 1s in the Multiplication source, and uses one
   2937 // or two shifts to materialize it. A sequence of 1s, e.g.,
   2938 //
   2939 //                  M           N
   2940 //   ...00000000000011111...111110000000...
   2941 //
   2942 // is materializable with (1 << (M + 1)) - (1 << N):
   2943 //
   2944 //   ...00000000000100000...000000000000...      [1 << (M + 1)]
   2945 //   ...00000000000000000...000010000000... (-)  [1 << N]
   2946 //   --------------------------------------
   2947 //   ...00000000000011111...111110000000...
   2948 //
   2949 // And a single bit set, which is just a left shift.
   2950 namespace StrengthReduction {
   2951 enum AggregationOperation {
   2952   AO_Invalid,
   2953   AO_Add,
   2954   AO_Sub,
   2955 };
   2956 
   2957 // AggregateElement is a glorified <ShAmt, AddOrSub> tuple.
   2958 class AggregationElement {
   2959   AggregationElement(const AggregationElement &) = delete;
   2960 
   2961 public:
   2962   AggregationElement() = default;
   2963   AggregationElement &operator=(const AggregationElement &) = default;
   2964   AggregationElement(AggregationOperation Op, uint32_t ShAmt)
   2965       : Op(Op), ShAmt(ShAmt) {}
   2966 
   2967   Operand *createShiftedOperand(Cfg *Func, Variable *OpR) const {
   2968     assert(OpR->mustHaveReg());
   2969     if (ShAmt == 0) {
   2970       return OpR;
   2971     }
   2972     return OperandARM32FlexReg::create(
   2973         Func, IceType_i32, OpR, OperandARM32::LSL,
   2974         OperandARM32ShAmtImm::create(
   2975             Func, llvm::cast<ConstantInteger32>(
   2976                       Func->getContext()->getConstantInt32(ShAmt))));
   2977   }
   2978 
   2979   bool aggregateWithAdd() const {
   2980     switch (Op) {
   2981     case AO_Invalid:
   2982       llvm::report_fatal_error("Invalid Strength Reduction Operations.");
   2983     case AO_Add:
   2984       return true;
   2985     case AO_Sub:
   2986       return false;
   2987     }
   2988     llvm_unreachable("(silence g++ warning)");
   2989   }
   2990 
   2991   uint32_t shAmt() const { return ShAmt; }
   2992 
   2993 private:
   2994   AggregationOperation Op = AO_Invalid;
   2995   uint32_t ShAmt;
   2996 };
   2997 
   2998 // [RangeStart, RangeEnd] is a range of 1s in Src.
   2999 template <std::size_t N>
   3000 bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations,
   3001                    std::array<AggregationElement, N> *Operations) {
   3002   assert(*NumOperations < N);
   3003   if (RangeStart == RangeEnd) {
   3004     // Single bit set:
   3005     // Src           : 0...00010...
   3006     // RangeStart    :        ^
   3007     // RangeEnd      :        ^
   3008     // NegSrc        : 0...00001...
   3009     (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart);
   3010     ++(*NumOperations);
   3011     return true;
   3012   }
   3013 
   3014   // Sequence of 1s: (two operations required.)
   3015   // Src           : 0...00011...110...
   3016   // RangeStart    :        ^
   3017   // RangeEnd      :              ^
   3018   // NegSrc        : 0...00000...001...
   3019   if (*NumOperations + 1 >= N) {
   3020     return false;
   3021   }
   3022   (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart + 1);
   3023   ++(*NumOperations);
   3024   (*Operations)[*NumOperations] = AggregationElement(AO_Sub, RangeEnd);
   3025   ++(*NumOperations);
   3026   return true;
   3027 }
   3028 
   3029 // tryToOptmize scans Src looking for sequences of 1s (including the unitary bit
   3030 // 1 surrounded by zeroes.
   3031 template <std::size_t N>
   3032 bool tryToOptimize(uint32_t Src, SizeT *NumOperations,
   3033                    std::array<AggregationElement, N> *Operations) {
   3034   constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT;
   3035   uint32_t NegSrc = ~Src;
   3036 
   3037   *NumOperations = 0;
   3038   while (Src != 0 && *NumOperations < N) {
   3039     // Each step of the algorithm:
   3040     //   * finds L, the last bit set in Src;
   3041     //   * clears all the upper bits in NegSrc up to bit L;
   3042     //   * finds nL, the last bit set in NegSrc;
   3043     //   * clears all the upper bits in Src up to bit nL;
   3044     //
   3045     // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence
   3046     // of 1s starting at L, and ending at nL + 1, was found.
   3047     const uint32_t SrcLastBitSet = llvm::findLastSet(Src);
   3048     const uint32_t NegSrcClearMask =
   3049         (SrcLastBitSet == 0) ? 0
   3050                              : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet);
   3051     NegSrc &= NegSrcClearMask;
   3052     if (NegSrc == 0) {
   3053       if (addOperations(SrcLastBitSet, 0, NumOperations, Operations)) {
   3054         return true;
   3055       }
   3056       return false;
   3057     }
   3058     const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc);
   3059     assert(NegSrcLastBitSet < SrcLastBitSet);
   3060     const uint32_t SrcClearMask =
   3061         (NegSrcLastBitSet == 0) ? 0 : (0xFFFFFFFFu) >>
   3062                                           (SrcSizeBits - NegSrcLastBitSet);
   3063     Src &= SrcClearMask;
   3064     if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations,
   3065                        Operations)) {
   3066       return false;
   3067     }
   3068   }
   3069 
   3070   return Src == 0;
   3071 }
   3072 } // end of namespace StrengthReduction
   3073 } // end of anonymous namespace
   3074 
   3075 void TargetARM32::lowerArithmetic(const InstArithmetic *Instr) {
   3076   Variable *Dest = Instr->getDest();
   3077 
   3078   if (Dest->isRematerializable()) {
   3079     Context.insert<InstFakeDef>(Dest);
   3080     return;
   3081   }
   3082 
   3083   Type DestTy = Dest->getType();
   3084   if (DestTy == IceType_i1) {
   3085     lowerInt1Arithmetic(Instr);
   3086     return;
   3087   }
   3088 
   3089   Operand *Src0 = legalizeUndef(Instr->getSrc(0));
   3090   Operand *Src1 = legalizeUndef(Instr->getSrc(1));
   3091   if (DestTy == IceType_i64) {
   3092     lowerInt64Arithmetic(Instr->getOp(), Instr->getDest(), Src0, Src1);
   3093     return;
   3094   }
   3095 
   3096   if (isVectorType(DestTy)) {
   3097     switch (Instr->getOp()) {
   3098     default:
   3099       UnimplementedLoweringError(this, Instr);
   3100       return;
   3101     // Explicitly whitelist vector instructions we have implemented/enabled.
   3102     case InstArithmetic::Add:
   3103     case InstArithmetic::And:
   3104     case InstArithmetic::Ashr:
   3105     case InstArithmetic::Fadd:
   3106     case InstArithmetic::Fmul:
   3107     case InstArithmetic::Fsub:
   3108     case InstArithmetic::Lshr:
   3109     case InstArithmetic::Mul:
   3110     case InstArithmetic::Or:
   3111     case InstArithmetic::Shl:
   3112     case InstArithmetic::Sub:
   3113     case InstArithmetic::Xor:
   3114       break;
   3115     }
   3116   }
   3117 
   3118   Variable *T = makeReg(DestTy);
   3119 
   3120   // * Handle div/rem separately. They require a non-legalized Src1 to inspect
   3121   // whether or not Src1 is a non-zero constant. Once legalized it is more
   3122   // difficult to determine (constant may be moved to a register).
   3123   // * Handle floating point arithmetic separately: they require Src1 to be
   3124   // legalized to a register.
   3125   switch (Instr->getOp()) {
   3126   default:
   3127     break;
   3128   case InstArithmetic::Udiv: {
   3129     constexpr bool NotRemainder = false;
   3130     Variable *Src0R = legalizeToReg(Src0);
   3131     lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt, &TargetARM32::_udiv,
   3132                  NotRemainder);
   3133     return;
   3134   }
   3135   case InstArithmetic::Sdiv: {
   3136     constexpr bool NotRemainder = false;
   3137     Variable *Src0R = legalizeToReg(Src0);
   3138     lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt, &TargetARM32::_sdiv,
   3139                  NotRemainder);
   3140     return;
   3141   }
   3142   case InstArithmetic::Urem: {
   3143     constexpr bool IsRemainder = true;
   3144     Variable *Src0R = legalizeToReg(Src0);
   3145     lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt, &TargetARM32::_udiv,
   3146                  IsRemainder);
   3147     return;
   3148   }
   3149   case InstArithmetic::Srem: {
   3150     constexpr bool IsRemainder = true;
   3151     Variable *Src0R = legalizeToReg(Src0);
   3152     lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt, &TargetARM32::_sdiv,
   3153                  IsRemainder);
   3154     return;
   3155   }
   3156   case InstArithmetic::Frem: {
   3157     if (!isScalarFloatingType(DestTy)) {
   3158       llvm::report_fatal_error("Unexpected type when lowering frem.");
   3159     }
   3160     llvm::report_fatal_error("Frem should have already been lowered.");
   3161   }
   3162   case InstArithmetic::Fadd: {
   3163     Variable *Src0R = legalizeToReg(Src0);
   3164     if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) {
   3165       Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0));
   3166       Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1));
   3167       _vmla(Src0R, Src1R, Src2R);
   3168       _mov(Dest, Src0R);
   3169       return;
   3170     }
   3171 
   3172     Variable *Src1R = legalizeToReg(Src1);
   3173     _vadd(T, Src0R, Src1R);
   3174     _mov(Dest, T);
   3175     return;
   3176   }
   3177   case InstArithmetic::Fsub: {
   3178     Variable *Src0R = legalizeToReg(Src0);
   3179     if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) {
   3180       Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0));
   3181       Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1));
   3182       _vmls(Src0R, Src1R, Src2R);
   3183       _mov(Dest, Src0R);
   3184       return;
   3185     }
   3186     Variable *Src1R = legalizeToReg(Src1);
   3187     _vsub(T, Src0R, Src1R);
   3188     _mov(Dest, T);
   3189     return;
   3190   }
   3191   case InstArithmetic::Fmul: {
   3192     Variable *Src0R = legalizeToReg(Src0);
   3193     Variable *Src1R = legalizeToReg(Src1);
   3194     _vmul(T, Src0R, Src1R);
   3195     _mov(Dest, T);
   3196     return;
   3197   }
   3198   case InstArithmetic::Fdiv: {
   3199     Variable *Src0R = legalizeToReg(Src0);
   3200     Variable *Src1R = legalizeToReg(Src1);
   3201     _vdiv(T, Src0R, Src1R);
   3202     _mov(Dest, T);
   3203     return;
   3204   }
   3205   }
   3206 
   3207   // Handle everything else here.
   3208   Int32Operands Srcs(Src0, Src1);
   3209   switch (Instr->getOp()) {
   3210   case InstArithmetic::_num:
   3211     llvm::report_fatal_error("Unknown arithmetic operator");
   3212     return;
   3213   case InstArithmetic::Add: {
   3214     if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) {
   3215       assert(!isVectorType(DestTy));
   3216       Variable *Src0R = legalizeToReg(Src0);
   3217       Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0));
   3218       Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1));
   3219       _mla(T, Src1R, Src2R, Src0R);
   3220       _mov(Dest, T);
   3221       return;
   3222     }
   3223 
   3224     if (Srcs.hasConstOperand()) {
   3225       if (!Srcs.immediateIsFlexEncodable() &&
   3226           Srcs.negatedImmediateIsFlexEncodable()) {
   3227         assert(!isVectorType(DestTy));
   3228         Variable *Src0R = Srcs.src0R(this);
   3229         Operand *Src1F = Srcs.negatedSrc1F(this);
   3230         if (!Srcs.swappedOperands()) {
   3231           _sub(T, Src0R, Src1F);
   3232         } else {
   3233           _rsb(T, Src0R, Src1F);
   3234         }
   3235         _mov(Dest, T);
   3236         return;
   3237       }
   3238     }
   3239     Variable *Src0R = Srcs.src0R(this);
   3240     if (isVectorType(DestTy)) {
   3241       Variable *Src1R = legalizeToReg(Src1);
   3242       _vadd(T, Src0R, Src1R);
   3243     } else {
   3244       Operand *Src1RF = Srcs.src1RF(this);
   3245       _add(T, Src0R, Src1RF);
   3246     }
   3247     _mov(Dest, T);
   3248     return;
   3249   }
   3250   case InstArithmetic::And: {
   3251     if (Srcs.hasConstOperand()) {
   3252       if (!Srcs.immediateIsFlexEncodable() &&
   3253           Srcs.invertedImmediateIsFlexEncodable()) {
   3254         Variable *Src0R = Srcs.src0R(this);
   3255         Operand *Src1F = Srcs.invertedSrc1F(this);
   3256         _bic(T, Src0R, Src1F);
   3257         _mov(Dest, T);
   3258         return;
   3259       }
   3260     }
   3261     assert(isIntegerType(DestTy));
   3262     Variable *Src0R = Srcs.src0R(this);
   3263     if (isVectorType(DestTy)) {
   3264       Variable *Src1R = legalizeToReg(Src1);
   3265       _vand(T, Src0R, Src1R);
   3266     } else {
   3267       Operand *Src1RF = Srcs.src1RF(this);
   3268       _and(T, Src0R, Src1RF);
   3269     }
   3270     _mov(Dest, T);
   3271     return;
   3272   }
   3273   case InstArithmetic::Or: {
   3274     Variable *Src0R = Srcs.src0R(this);
   3275     assert(isIntegerType(DestTy));
   3276     if (isVectorType(DestTy)) {
   3277       Variable *Src1R = legalizeToReg(Src1);
   3278       _vorr(T, Src0R, Src1R);
   3279     } else {
   3280       Operand *Src1RF = Srcs.src1RF(this);
   3281       _orr(T, Src0R, Src1RF);
   3282     }
   3283     _mov(Dest, T);
   3284     return;
   3285   }
   3286   case InstArithmetic::Xor: {
   3287     Variable *Src0R = Srcs.src0R(this);
   3288     assert(isIntegerType(DestTy));
   3289     if (isVectorType(DestTy)) {
   3290       Variable *Src1R = legalizeToReg(Src1);
   3291       _veor(T, Src0R, Src1R);
   3292     } else {
   3293       Operand *Src1RF = Srcs.src1RF(this);
   3294       _eor(T, Src0R, Src1RF);
   3295     }
   3296     _mov(Dest, T);
   3297     return;
   3298   }
   3299   case InstArithmetic::Sub: {
   3300     if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) {
   3301       assert(!isVectorType(DestTy));
   3302       Variable *Src0R = legalizeToReg(Src0);
   3303       Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0));
   3304       Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1));
   3305       _mls(T, Src1R, Src2R, Src0R);
   3306       _mov(Dest, T);
   3307       return;
   3308     }
   3309 
   3310     if (Srcs.hasConstOperand()) {
   3311       assert(!isVectorType(DestTy));
   3312       if (Srcs.immediateIsFlexEncodable()) {
   3313         Variable *Src0R = Srcs.src0R(this);
   3314         Operand *Src1RF = Srcs.src1RF(this);
   3315         if (Srcs.swappedOperands()) {
   3316           _rsb(T, Src0R, Src1RF);
   3317         } else {
   3318           _sub(T, Src0R, Src1RF);
   3319         }
   3320         _mov(Dest, T);
   3321         return;
   3322       }
   3323       if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {
   3324         Variable *Src0R = Srcs.src0R(this);
   3325         Operand *Src1F = Srcs.negatedSrc1F(this);
   3326         _add(T, Src0R, Src1F);
   3327         _mov(Dest, T);
   3328         return;
   3329       }
   3330     }
   3331     Variable *Src0R = Srcs.unswappedSrc0R(this);
   3332     Variable *Src1R = Srcs.unswappedSrc1R(this);
   3333     if (isVectorType(DestTy)) {
   3334       _vsub(T, Src0R, Src1R);
   3335     } else {
   3336       _sub(T, Src0R, Src1R);
   3337     }
   3338     _mov(Dest, T);
   3339     return;
   3340   }
   3341   case InstArithmetic::Mul: {
   3342     const bool OptM1 = Func->getOptLevel() == Opt_m1;
   3343     if (!OptM1 && Srcs.hasConstOperand()) {
   3344       constexpr std::size_t MaxShifts = 4;
   3345       std::array<StrengthReduction::AggregationElement, MaxShifts> Shifts;
   3346       SizeT NumOperations;
   3347       int32_t Const = Srcs.getConstantValue();
   3348       const bool Invert = Const < 0;
   3349       const bool MultiplyByZero = Const == 0;
   3350       Operand *_0 =
   3351           legalize(Ctx->getConstantZero(DestTy), Legal_Reg | Legal_Flex);
   3352 
   3353       if (MultiplyByZero) {
   3354         _mov(T, _0);
   3355         _mov(Dest, T);
   3356         return;
   3357       }
   3358 
   3359       if (Invert) {
   3360         Const = -Const;
   3361       }
   3362 
   3363       if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) {
   3364         assert(NumOperations >= 1);
   3365         Variable *Src0R = Srcs.src0R(this);
   3366         int32_t Start;
   3367         int32_t End;
   3368         if (NumOperations == 1 || Shifts[NumOperations - 1].shAmt() != 0) {
   3369           // Multiplication by a power of 2 (NumOperations == 1); or
   3370           // Multiplication by a even number not a power of 2.
   3371           Start = 1;
   3372           End = NumOperations;
   3373           assert(Shifts[0].aggregateWithAdd());
   3374           _lsl(T, Src0R, shAmtImm(Shifts[0].shAmt()));
   3375         } else {
   3376           // Multiplication by an odd number. Put the free barrel shifter to a
   3377           // good use.
   3378           Start = 0;
   3379           End = NumOperations - 2;
   3380           const StrengthReduction::AggregationElement &Last =
   3381               Shifts[NumOperations - 1];
   3382           const StrengthReduction::AggregationElement &SecondToLast =
   3383               Shifts[NumOperations - 2];
   3384           if (!Last.aggregateWithAdd()) {
   3385             assert(SecondToLast.aggregateWithAdd());
   3386             _rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
   3387           } else if (!SecondToLast.aggregateWithAdd()) {
   3388             assert(Last.aggregateWithAdd());
   3389             _sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
   3390           } else {
   3391             _add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
   3392           }
   3393         }
   3394 
   3395         // Odd numbers :   S                                 E   I   I
   3396         //               +---+---+---+---+---+---+ ... +---+---+---+---+
   3397         //     Shifts  = |   |   |   |   |   |   | ... |   |   |   |   |
   3398         //               +---+---+---+---+---+---+ ... +---+---+---+---+
   3399         // Even numbers:   I   S                                     E
   3400         //
   3401         // S: Start; E: End; I: Init
   3402         for (int32_t I = Start; I < End; ++I) {
   3403           const StrengthReduction::AggregationElement &Current = Shifts[I];
   3404           Operand *SrcF = Current.createShiftedOperand(Func, Src0R);
   3405           if (Current.aggregateWithAdd()) {
   3406             _add(T, T, SrcF);
   3407           } else {
   3408             _sub(T, T, SrcF);
   3409           }
   3410         }
   3411 
   3412         if (Invert) {
   3413           // T = 0 - T.
   3414           _rsb(T, T, _0);
   3415         }
   3416 
   3417         _mov(Dest, T);
   3418         return;
   3419       }
   3420     }
   3421     Variable *Src0R = Srcs.unswappedSrc0R(this);
   3422     Variable *Src1R = Srcs.unswappedSrc1R(this);
   3423     if (isVectorType(DestTy)) {
   3424       _vmul(T, Src0R, Src1R);
   3425     } else {
   3426       _mul(T, Src0R, Src1R);
   3427     }
   3428     _mov(Dest, T);
   3429     return;
   3430   }
   3431   case InstArithmetic::Shl: {
   3432     Variable *Src0R = Srcs.unswappedSrc0R(this);
   3433     if (!isVectorType(T->getType())) {
   3434       if (Srcs.isSrc1ImmediateZero()) {
   3435         _mov(T, Src0R);
   3436       } else {
   3437         Operand *Src1R = Srcs.unswappedSrc1RShAmtImm(this);
   3438         _lsl(T, Src0R, Src1R);
   3439       }
   3440     } else {
   3441       if (Srcs.hasConstOperand()) {
   3442         ConstantInteger32 *ShAmt = llvm::cast<ConstantInteger32>(Srcs.src1());
   3443         _vshl(T, Src0R, ShAmt);
   3444       } else {
   3445         auto *Src1R = Srcs.unswappedSrc1R(this);
   3446         _vshl(T, Src0R, Src1R)->setSignType(InstARM32::FS_Unsigned);
   3447       }
   3448     }
   3449     _mov(Dest, T);
   3450     return;
   3451   }
   3452   case InstArithmetic::Lshr: {
   3453     Variable *Src0R = Srcs.unswappedSrc0R(this);
   3454     if (!isVectorType(T->getType())) {
   3455       if (DestTy != IceType_i32) {
   3456         _uxt(Src0R, Src0R);
   3457       }
   3458       if (Srcs.isSrc1ImmediateZero()) {
   3459         _mov(T, Src0R);
   3460       } else {
   3461         Operand *Src1R = Srcs.unswappedSrc1RShAmtImm(this);
   3462         _lsr(T, Src0R, Src1R);
   3463       }
   3464     } else {
   3465       if (Srcs.hasConstOperand()) {
   3466         ConstantInteger32 *ShAmt = llvm::cast<ConstantInteger32>(Srcs.src1());
   3467         _vshr(T, Src0R, ShAmt)->setSignType(InstARM32::FS_Unsigned);
   3468       } else {
   3469         auto *Src1R = Srcs.unswappedSrc1R(this);
   3470         auto *Src1RNeg = makeReg(Src1R->getType());
   3471         _vneg(Src1RNeg, Src1R);
   3472         _vshl(T, Src0R, Src1RNeg)->setSignType(InstARM32::FS_Unsigned);
   3473       }
   3474     }
   3475     _mov(Dest, T);
   3476     return;
   3477   }
   3478   case InstArithmetic::Ashr: {
   3479     Variable *Src0R = Srcs.unswappedSrc0R(this);
   3480     if (!isVectorType(T->getType())) {
   3481       if (DestTy != IceType_i32) {
   3482         _sxt(Src0R, Src0R);
   3483       }
   3484       if (Srcs.isSrc1ImmediateZero()) {
   3485         _mov(T, Src0R);
   3486       } else {
   3487         _asr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));
   3488       }
   3489     } else {
   3490       if (Srcs.hasConstOperand()) {
   3491         ConstantInteger32 *ShAmt = llvm::cast<ConstantInteger32>(Srcs.src1());
   3492         _vshr(T, Src0R, ShAmt)->setSignType(InstARM32::FS_Signed);
   3493       } else {
   3494         auto *Src1R = Srcs.unswappedSrc1R(this);
   3495         auto *Src1RNeg = makeReg(Src1R->getType());
   3496         _vneg(Src1RNeg, Src1R);
   3497         _vshl(T, Src0R, Src1RNeg)->setSignType(InstARM32::FS_Signed);
   3498       }
   3499     }
   3500     _mov(Dest, T);
   3501     return;
   3502   }
   3503   case InstArithmetic::Udiv:
   3504   case InstArithmetic::Sdiv:
   3505   case InstArithmetic::Urem:
   3506   case InstArithmetic::Srem:
   3507     llvm::report_fatal_error(
   3508         "Integer div/rem should have been handled earlier.");
   3509     return;
   3510   case InstArithmetic::Fadd:
   3511   case InstArithmetic::Fsub:
   3512   case InstArithmetic::Fmul:
   3513   case InstArithmetic::Fdiv:
   3514   case InstArithmetic::Frem:
   3515     llvm::report_fatal_error(
   3516         "Floating point arith should have been handled earlier.");
   3517     return;
   3518   }
   3519 }
   3520 
   3521 void TargetARM32::lowerAssign(const InstAssign *Instr) {
   3522   Variable *Dest = Instr->getDest();
   3523 
   3524   if (Dest->isRematerializable()) {
   3525     Context.insert<InstFakeDef>(Dest);
   3526     return;
   3527   }
   3528 
   3529   Operand *Src0 = Instr->getSrc(0);
   3530   assert(Dest->getType() == Src0->getType());
   3531   if (Dest->getType() == IceType_i64) {
   3532     Src0 = legalizeUndef(Src0);
   3533 
   3534     Variable *T_Lo = makeReg(IceType_i32);
   3535     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   3536     Operand *Src0Lo = legalize(loOperand(Src0), Legal_Reg | Legal_Flex);
   3537     _mov(T_Lo, Src0Lo);
   3538     _mov(DestLo, T_Lo);
   3539 
   3540     Variable *T_Hi = makeReg(IceType_i32);
   3541     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   3542     Operand *Src0Hi = legalize(hiOperand(Src0), Legal_Reg | Legal_Flex);
   3543     _mov(T_Hi, Src0Hi);
   3544     _mov(DestHi, T_Hi);
   3545 
   3546     return;
   3547   }
   3548 
   3549   Operand *NewSrc;
   3550   if (Dest->hasReg()) {
   3551     // If Dest already has a physical register, then legalize the Src operand
   3552     // into a Variable with the same register assignment. This especially
   3553     // helps allow the use of Flex operands.
   3554     NewSrc = legalize(Src0, Legal_Reg | Legal_Flex, Dest->getRegNum());
   3555   } else {
   3556     // Dest could be a stack operand. Since we could potentially need to do a
   3557     // Store (and store can only have Register operands), legalize this to a
   3558     // register.
   3559     NewSrc = legalize(Src0, Legal_Reg);
   3560   }
   3561 
   3562   if (isVectorType(Dest->getType()) || isScalarFloatingType(Dest->getType())) {
   3563     NewSrc = legalize(NewSrc, Legal_Reg | Legal_Mem);
   3564   }
   3565   _mov(Dest, NewSrc);
   3566 }
   3567 
   3568 TargetARM32::ShortCircuitCondAndLabel TargetARM32::lowerInt1ForBranch(
   3569     Operand *Boolean, const LowerInt1BranchTarget &TargetTrue,
   3570     const LowerInt1BranchTarget &TargetFalse, uint32_t ShortCircuitable) {
   3571   InstARM32Label *NewShortCircuitLabel = nullptr;
   3572   Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
   3573 
   3574   const Inst *Producer = Computations.getProducerOf(Boolean);
   3575 
   3576   if (Producer == nullptr) {
   3577     // No producer, no problem: just do emit code to perform (Boolean & 1) and
   3578     // set the flags register. The branch should be taken if the resulting flags
   3579     // indicate a non-zero result.
   3580     _tst(legalizeToReg(Boolean), _1);
   3581     return ShortCircuitCondAndLabel(CondWhenTrue(CondARM32::NE));
   3582   }
   3583 
   3584   switch (Producer->getKind()) {
   3585   default:
   3586     llvm::report_fatal_error("Unexpected producer.");
   3587   case Inst::Icmp: {
   3588     return ShortCircuitCondAndLabel(
   3589         lowerIcmpCond(llvm::cast<InstIcmp>(Producer)));
   3590   } break;
   3591   case Inst::Fcmp: {
   3592     return ShortCircuitCondAndLabel(
   3593         lowerFcmpCond(llvm::cast<InstFcmp>(Producer)));
   3594   } break;
   3595   case Inst::Cast: {
   3596     const auto *CastProducer = llvm::cast<InstCast>(Producer);
   3597     assert(CastProducer->getCastKind() == InstCast::Trunc);
   3598     Operand *Src = CastProducer->getSrc(0);
   3599     if (Src->getType() == IceType_i64)
   3600       Src = loOperand(Src);
   3601     _tst(legalizeToReg(Src), _1);
   3602     return ShortCircuitCondAndLabel(CondWhenTrue(CondARM32::NE));
   3603   } break;
   3604   case Inst::Arithmetic: {
   3605     const auto *ArithProducer = llvm::cast<InstArithmetic>(Producer);
   3606     switch (ArithProducer->getOp()) {
   3607     default:
   3608       llvm::report_fatal_error("Unhandled Arithmetic Producer.");
   3609     case InstArithmetic::And: {
   3610       if (!(ShortCircuitable & SC_And)) {
   3611         NewShortCircuitLabel = InstARM32Label::create(Func, this);
   3612       }
   3613 
   3614       LowerInt1BranchTarget NewTarget =
   3615           TargetFalse.createForLabelOrDuplicate(NewShortCircuitLabel);
   3616 
   3617       ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch(
   3618           Producer->getSrc(0), TargetTrue, NewTarget, SC_And);
   3619       const CondWhenTrue &Cond = CondAndLabel.Cond;
   3620 
   3621       _br_short_circuit(NewTarget, Cond.invert());
   3622 
   3623       InstARM32Label *const ShortCircuitLabel = CondAndLabel.ShortCircuitTarget;
   3624       if (ShortCircuitLabel != nullptr)
   3625         Context.insert(ShortCircuitLabel);
   3626 
   3627       return ShortCircuitCondAndLabel(
   3628           lowerInt1ForBranch(Producer->getSrc(1), TargetTrue, NewTarget, SC_All)
   3629               .assertNoLabelAndReturnCond(),
   3630           NewShortCircuitLabel);
   3631     } break;
   3632     case InstArithmetic::Or: {
   3633       if (!(ShortCircuitable & SC_Or)) {
   3634         NewShortCircuitLabel = InstARM32Label::create(Func, this);
   3635       }
   3636 
   3637       LowerInt1BranchTarget NewTarget =
   3638           TargetTrue.createForLabelOrDuplicate(NewShortCircuitLabel);
   3639 
   3640       ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch(
   3641           Producer->getSrc(0), NewTarget, TargetFalse, SC_Or);
   3642       const CondWhenTrue &Cond = CondAndLabel.Cond;
   3643 
   3644       _br_short_circuit(NewTarget, Cond);
   3645 
   3646       InstARM32Label *const ShortCircuitLabel = CondAndLabel.ShortCircuitTarget;
   3647       if (ShortCircuitLabel != nullptr)
   3648         Context.insert(ShortCircuitLabel);
   3649 
   3650       return ShortCircuitCondAndLabel(lowerInt1ForBranch(Producer->getSrc(1),
   3651                                                          NewTarget, TargetFalse,
   3652                                                          SC_All)
   3653                                           .assertNoLabelAndReturnCond(),
   3654                                       NewShortCircuitLabel);
   3655     } break;
   3656     }
   3657   }
   3658   }
   3659 }
   3660 
   3661 void TargetARM32::lowerBr(const InstBr *Instr) {
   3662   if (Instr->isUnconditional()) {
   3663     _br(Instr->getTargetUnconditional());
   3664     return;
   3665   }
   3666 
   3667   CfgNode *TargetTrue = Instr->getTargetTrue();
   3668   CfgNode *TargetFalse = Instr->getTargetFalse();
   3669   ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch(
   3670       Instr->getCondition(), LowerInt1BranchTarget(TargetTrue),
   3671       LowerInt1BranchTarget(TargetFalse), SC_All);
   3672   assert(CondAndLabel.ShortCircuitTarget == nullptr);
   3673 
   3674   const CondWhenTrue &Cond = CondAndLabel.Cond;
   3675   if (Cond.WhenTrue1 != CondARM32::kNone) {
   3676     assert(Cond.WhenTrue0 != CondARM32::AL);
   3677     _br(TargetTrue, Cond.WhenTrue1);
   3678   }
   3679 
   3680   switch (Cond.WhenTrue0) {
   3681   default:
   3682     _br(TargetTrue, TargetFalse, Cond.WhenTrue0);
   3683     break;
   3684   case CondARM32::kNone:
   3685     _br(TargetFalse);
   3686     break;
   3687   case CondARM32::AL:
   3688     _br(TargetTrue);
   3689     break;
   3690   }
   3691 }
   3692 
   3693 void TargetARM32::lowerCall(const InstCall *Instr) {
   3694   Operand *CallTarget = Instr->getCallTarget();
   3695   if (Instr->isTargetHelperCall()) {
   3696     auto TargetHelperPreamble = ARM32HelpersPreamble.find(CallTarget);
   3697     if (TargetHelperPreamble != ARM32HelpersPreamble.end()) {
   3698       (this->*TargetHelperPreamble->second)(Instr);
   3699     }
   3700   }
   3701   MaybeLeafFunc = false;
   3702   NeedsStackAlignment = true;
   3703 
   3704   // Assign arguments to registers and stack. Also reserve stack.
   3705   TargetARM32::CallingConv CC;
   3706   // Pair of Arg Operand -> GPR number assignments.
   3707   llvm::SmallVector<std::pair<Operand *, RegNumT>, NumGPRArgs> GPRArgs;
   3708   llvm::SmallVector<std::pair<Operand *, RegNumT>, NumFP32Args> FPArgs;
   3709   // Pair of Arg Operand -> stack offset.
   3710   llvm::SmallVector<std::pair<Operand *, int32_t>, 8> StackArgs;
   3711   size_t ParameterAreaSizeBytes = 0;
   3712 
   3713   // Classify each argument operand according to the location where the
   3714   // argument is passed.
   3715   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
   3716     Operand *Arg = legalizeUndef(Instr->getArg(i));
   3717     const Type Ty = Arg->getType();
   3718     bool InReg = false;
   3719     RegNumT Reg;
   3720     if (isScalarIntegerType(Ty)) {
   3721       InReg = CC.argInGPR(Ty, &Reg);
   3722     } else {
   3723       InReg = CC.argInVFP(Ty, &Reg);
   3724     }
   3725 
   3726     if (!InReg) {
   3727       ParameterAreaSizeBytes =
   3728           applyStackAlignmentTy(ParameterAreaSizeBytes, Ty);
   3729       StackArgs.push_back(std::make_pair(Arg, ParameterAreaSizeBytes));
   3730       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Ty);
   3731       continue;
   3732     }
   3733 
   3734     if (Ty == IceType_i64) {
   3735       Operand *Lo = loOperand(Arg);
   3736       Operand *Hi = hiOperand(Arg);
   3737       GPRArgs.push_back(std::make_pair(
   3738           Lo, RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(Reg))));
   3739       GPRArgs.push_back(std::make_pair(
   3740           Hi, RegNumT::fixme(RegARM32::getI64PairSecondGPRNum(Reg))));
   3741     } else if (isScalarIntegerType(Ty)) {
   3742       GPRArgs.push_back(std::make_pair(Arg, Reg));
   3743     } else {
   3744       FPArgs.push_back(std::make_pair(Arg, Reg));
   3745     }
   3746   }
   3747 
   3748   // Adjust the parameter area so that the stack is aligned. It is assumed that
   3749   // the stack is already aligned at the start of the calling sequence.
   3750   ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
   3751 
   3752   if (ParameterAreaSizeBytes > MaxOutArgsSizeBytes) {
   3753     llvm::report_fatal_error("MaxOutArgsSizeBytes is not really a max.");
   3754   }
   3755 
   3756   // Copy arguments that are passed on the stack to the appropriate stack
   3757   // locations.
   3758   Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   3759   for (auto &StackArg : StackArgs) {
   3760     ConstantInteger32 *Loc =
   3761         llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(StackArg.second));
   3762     Type Ty = StackArg.first->getType();
   3763     OperandARM32Mem *Addr;
   3764     constexpr bool SignExt = false;
   3765     if (OperandARM32Mem::canHoldOffset(Ty, SignExt, StackArg.second)) {
   3766       Addr = OperandARM32Mem::create(Func, Ty, SP, Loc);
   3767     } else {
   3768       Variable *NewBase = Func->makeVariable(SP->getType());
   3769       lowerArithmetic(
   3770           InstArithmetic::create(Func, InstArithmetic::Add, NewBase, SP, Loc));
   3771       Addr = formMemoryOperand(NewBase, Ty);
   3772     }
   3773     lowerStore(InstStore::create(Func, StackArg.first, Addr));
   3774   }
   3775 
   3776   // Generate the call instruction. Assign its result to a temporary with high
   3777   // register allocation weight.
   3778   Variable *Dest = Instr->getDest();
   3779   // ReturnReg doubles as ReturnRegLo as necessary.
   3780   Variable *ReturnReg = nullptr;
   3781   Variable *ReturnRegHi = nullptr;
   3782   if (Dest) {
   3783     switch (Dest->getType()) {
   3784     case IceType_NUM:
   3785       llvm::report_fatal_error("Invalid Call dest type");
   3786       break;
   3787     case IceType_void:
   3788       break;
   3789     case IceType_i1:
   3790       assert(Computations.getProducerOf(Dest) == nullptr);
   3791     // Fall-through intended.
   3792     case IceType_i8:
   3793     case IceType_i16:
   3794     case IceType_i32:
   3795       ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_r0);
   3796       break;
   3797     case IceType_i64:
   3798       ReturnReg = makeReg(IceType_i32, RegARM32::Reg_r0);
   3799       ReturnRegHi = makeReg(IceType_i32, RegARM32::Reg_r1);
   3800       break;
   3801     case IceType_f32:
   3802       ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_s0);
   3803       break;
   3804     case IceType_f64:
   3805       ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_d0);
   3806       break;
   3807     case IceType_v4i1:
   3808     case IceType_v8i1:
   3809     case IceType_v16i1:
   3810     case IceType_v16i8:
   3811     case IceType_v8i16:
   3812     case IceType_v4i32:
   3813     case IceType_v4f32:
   3814       ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_q0);
   3815       break;
   3816     }
   3817   }
   3818 
   3819   // Allow ConstantRelocatable to be left alone as a direct call, but force
   3820   // other constants like ConstantInteger32 to be in a register and make it an
   3821   // indirect call.
   3822   if (!llvm::isa<ConstantRelocatable>(CallTarget)) {
   3823     CallTarget = legalize(CallTarget, Legal_Reg);
   3824   }
   3825 
   3826   // Copy arguments to be passed in registers to the appropriate registers.
   3827   CfgVector<Variable *> RegArgs;
   3828   for (auto &FPArg : FPArgs) {
   3829     RegArgs.emplace_back(legalizeToReg(FPArg.first, FPArg.second));
   3830   }
   3831   for (auto &GPRArg : GPRArgs) {
   3832     RegArgs.emplace_back(legalizeToReg(GPRArg.first, GPRArg.second));
   3833   }
   3834 
   3835   // Generate a FakeUse of register arguments so that they do not get dead code
   3836   // eliminated as a result of the FakeKill of scratch registers after the call.
   3837   // These fake-uses need to be placed here to avoid argument registers from
   3838   // being used during the legalizeToReg() calls above.
   3839   for (auto *RegArg : RegArgs) {
   3840     Context.insert<InstFakeUse>(RegArg);
   3841   }
   3842 
   3843   InstARM32Call *NewCall =
   3844       Sandboxer(this, InstBundleLock::Opt_AlignToEnd).bl(ReturnReg, CallTarget);
   3845 
   3846   if (ReturnRegHi)
   3847     Context.insert<InstFakeDef>(ReturnRegHi);
   3848 
   3849   // Insert a register-kill pseudo instruction.
   3850   Context.insert<InstFakeKill>(NewCall);
   3851 
   3852   // Generate a FakeUse to keep the call live if necessary.
   3853   if (Instr->hasSideEffects() && ReturnReg) {
   3854     Context.insert<InstFakeUse>(ReturnReg);
   3855   }
   3856 
   3857   if (Dest != nullptr) {
   3858     // Assign the result of the call to Dest.
   3859     if (ReturnReg != nullptr) {
   3860       if (ReturnRegHi) {
   3861         auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
   3862         Variable *DestLo = Dest64On32->getLo();
   3863         Variable *DestHi = Dest64On32->getHi();
   3864         _mov(DestLo, ReturnReg);
   3865         _mov(DestHi, ReturnRegHi);
   3866       } else {
   3867         if (isFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
   3868           _mov(Dest, ReturnReg);
   3869         } else {
   3870           assert(isIntegerType(Dest->getType()) &&
   3871                  typeWidthInBytes(Dest->getType()) <= 4);
   3872           _mov(Dest, ReturnReg);
   3873         }
   3874       }
   3875     }
   3876   }
   3877 
   3878   if (Instr->isTargetHelperCall()) {
   3879     auto TargetHelpersPostamble = ARM32HelpersPostamble.find(CallTarget);
   3880     if (TargetHelpersPostamble != ARM32HelpersPostamble.end()) {
   3881       (this->*TargetHelpersPostamble->second)(Instr);
   3882     }
   3883   }
   3884 }
   3885 
   3886 namespace {
   3887 void configureBitcastTemporary(Variable64On32 *Var) {
   3888   Var->setMustNotHaveReg();
   3889   Var->getHi()->setMustHaveReg();
   3890   Var->getLo()->setMustHaveReg();
   3891 }
   3892 } // end of anonymous namespace
   3893 
   3894 void TargetARM32::lowerCast(const InstCast *Instr) {
   3895   InstCast::OpKind CastKind = Instr->getCastKind();
   3896   Variable *Dest = Instr->getDest();
   3897   const Type DestTy = Dest->getType();
   3898   Operand *Src0 = legalizeUndef(Instr->getSrc(0));
   3899   switch (CastKind) {
   3900   default:
   3901     Func->setError("Cast type not supported");
   3902     return;
   3903   case InstCast::Sext: {
   3904     if (isVectorType(DestTy)) {
   3905       Variable *T0 = makeReg(DestTy);
   3906       Variable *T1 = makeReg(DestTy);
   3907       ConstantInteger32 *ShAmt = nullptr;
   3908       switch (DestTy) {
   3909       default:
   3910         llvm::report_fatal_error("Unexpected type in vector sext.");
   3911       case IceType_v16i8:
   3912         ShAmt = llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(7));
   3913         break;
   3914       case IceType_v8i16:
   3915         ShAmt = llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(15));
   3916         break;
   3917       case IceType_v4i32:
   3918         ShAmt = llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(31));
   3919         break;
   3920       }
   3921       auto *Src0R = legalizeToReg(Src0);
   3922       _vshl(T0, Src0R, ShAmt);
   3923       _vshr(T1, T0, ShAmt)->setSignType(InstARM32::FS_Signed);
   3924       _mov(Dest, T1);
   3925     } else if (DestTy == IceType_i64) {
   3926       // t1=sxtb src; t2= mov t1 asr #31; dst.lo=t1; dst.hi=t2
   3927       Constant *ShiftAmt = Ctx->getConstantInt32(31);
   3928       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   3929       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   3930       Variable *T_Lo = makeReg(DestLo->getType());
   3931       if (Src0->getType() == IceType_i32) {
   3932         Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
   3933         _mov(T_Lo, Src0RF);
   3934       } else if (Src0->getType() != IceType_i1) {
   3935         Variable *Src0R = legalizeToReg(Src0);
   3936         _sxt(T_Lo, Src0R);
   3937       } else {
   3938         Operand *_0 = Ctx->getConstantZero(IceType_i32);
   3939         Operand *_m1 = Ctx->getConstantInt32(-1);
   3940         lowerInt1ForSelect(T_Lo, Src0, _m1, _0);
   3941       }
   3942       _mov(DestLo, T_Lo);
   3943       Variable *T_Hi = makeReg(DestHi->getType());
   3944       if (Src0->getType() != IceType_i1) {
   3945         _mov(T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, T_Lo,
   3946                                                OperandARM32::ASR, ShiftAmt));
   3947       } else {
   3948         // For i1, the asr instruction is already done above.
   3949         _mov(T_Hi, T_Lo);
   3950       }
   3951       _mov(DestHi, T_Hi);
   3952     } else if (Src0->getType() != IceType_i1) {
   3953       // t1 = sxt src; dst = t1
   3954       Variable *Src0R = legalizeToReg(Src0);
   3955       Variable *T = makeReg(DestTy);
   3956       _sxt(T, Src0R);
   3957       _mov(Dest, T);
   3958     } else {
   3959       Constant *_0 = Ctx->getConstantZero(IceType_i32);
   3960       Operand *_m1 = Ctx->getConstantInt(DestTy, -1);
   3961       Variable *T = makeReg(DestTy);
   3962       lowerInt1ForSelect(T, Src0, _m1, _0);
   3963       _mov(Dest, T);
   3964     }
   3965     break;
   3966   }
   3967   case InstCast::Zext: {
   3968     if (isVectorType(DestTy)) {
   3969       auto *Mask = makeReg(DestTy);
   3970       auto *_1 = Ctx->getConstantInt32(1);
   3971       auto *T = makeReg(DestTy);
   3972       auto *Src0R = legalizeToReg(Src0);
   3973       _mov(Mask, _1);
   3974       _vand(T, Src0R, Mask);
   3975       _mov(Dest, T);
   3976     } else if (DestTy == IceType_i64) {
   3977       // t1=uxtb src; dst.lo=t1; dst.hi=0
   3978       Operand *_0 =
   3979           legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   3980       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   3981       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   3982       Variable *T_Lo = makeReg(DestLo->getType());
   3983 
   3984       switch (Src0->getType()) {
   3985       default: {
   3986         assert(Src0->getType() != IceType_i64);
   3987         _uxt(T_Lo, legalizeToReg(Src0));
   3988       } break;
   3989       case IceType_i32: {
   3990         _mov(T_Lo, legalize(Src0, Legal_Reg | Legal_Flex));
   3991       } break;
   3992       case IceType_i1: {
   3993         SafeBoolChain Safe = lowerInt1(T_Lo, Src0);
   3994         if (Safe == SBC_No) {
   3995           Operand *_1 =
   3996               legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
   3997           _and(T_Lo, T_Lo, _1);
   3998         }
   3999       } break;
   4000       }
   4001 
   4002       _mov(DestLo, T_Lo);
   4003 
   4004       Variable *T_Hi = makeReg(DestLo->getType());
   4005       _mov(T_Hi, _0);
   4006       _mov(DestHi, T_Hi);
   4007     } else if (Src0->getType() == IceType_i1) {
   4008       Variable *T = makeReg(DestTy);
   4009 
   4010       SafeBoolChain Safe = lowerInt1(T, Src0);
   4011       if (Safe == SBC_No) {
   4012         Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
   4013         _and(T, T, _1);
   4014       }
   4015 
   4016       _mov(Dest, T);
   4017     } else {
   4018       // t1 = uxt src; dst = t1
   4019       Variable *Src0R = legalizeToReg(Src0);
   4020       Variable *T = makeReg(DestTy);
   4021       _uxt(T, Src0R);
   4022       _mov(Dest, T);
   4023     }
   4024     break;
   4025   }
   4026   case InstCast::Trunc: {
   4027     if (isVectorType(DestTy)) {
   4028       auto *T = makeReg(DestTy);
   4029       auto *Src0R = legalizeToReg(Src0);
   4030       _mov(T, Src0R);
   4031       _mov(Dest, T);
   4032     } else {
   4033       if (Src0->getType() == IceType_i64)
   4034         Src0 = loOperand(Src0);
   4035       Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
   4036       // t1 = trunc Src0RF; Dest = t1
   4037       Variable *T = makeReg(DestTy);
   4038       _mov(T, Src0RF);
   4039       if (DestTy == IceType_i1)
   4040         _and(T, T, Ctx->getConstantInt1(1));
   4041       _mov(Dest, T);
   4042     }
   4043     break;
   4044   }
   4045   case InstCast::Fptrunc:
   4046   case InstCast::Fpext: {
   4047     // fptrunc: dest.f32 = fptrunc src0.fp64
   4048     // fpext: dest.f64 = fptrunc src0.fp32
   4049     const bool IsTrunc = CastKind == InstCast::Fptrunc;
   4050     assert(!isVectorType(DestTy));
   4051     assert(DestTy == (IsTrunc ? IceType_f32 : IceType_f64));
   4052     assert(Src0->getType() == (IsTrunc ? IceType_f64 : IceType_f32));
   4053     Variable *Src0R = legalizeToReg(Src0);
   4054     Variable *T = makeReg(DestTy);
   4055     _vcvt(T, Src0R, IsTrunc ? InstARM32Vcvt::D2s : InstARM32Vcvt::S2d);
   4056     _mov(Dest, T);
   4057     break;
   4058   }
   4059   case InstCast::Fptosi:
   4060   case InstCast::Fptoui: {
   4061     const bool DestIsSigned = CastKind == InstCast::Fptosi;
   4062     Variable *Src0R = legalizeToReg(Src0);
   4063 
   4064     if (isVectorType(DestTy)) {
   4065       assert(typeElementType(Src0->getType()) == IceType_f32);
   4066       auto *T = makeReg(DestTy);
   4067       _vcvt(T, Src0R,
   4068             DestIsSigned ? InstARM32Vcvt::Vs2si : InstARM32Vcvt::Vs2ui);
   4069       _mov(Dest, T);
   4070       break;
   4071     }
   4072 
   4073     const bool Src0IsF32 = isFloat32Asserting32Or64(Src0->getType());
   4074     if (llvm::isa<Variable64On32>(Dest)) {
   4075       llvm::report_fatal_error("fp-to-i64 should have been pre-lowered.");
   4076     }
   4077     // fptosi:
   4078     //     t1.fp = vcvt src0.fp
   4079     //     t2.i32 = vmov t1.fp
   4080     //     dest.int = conv t2.i32     @ Truncates the result if needed.
   4081     // fptoui:
   4082     //     t1.fp = vcvt src0.fp
   4083     //     t2.u32 = vmov t1.fp
   4084     //     dest.uint = conv t2.u32    @ Truncates the result if needed.
   4085     Variable *T_fp = makeReg(IceType_f32);
   4086     const InstARM32Vcvt::VcvtVariant Conversion =
   4087         Src0IsF32 ? (DestIsSigned ? InstARM32Vcvt::S2si : InstARM32Vcvt::S2ui)
   4088                   : (DestIsSigned ? InstARM32Vcvt::D2si : InstARM32Vcvt::D2ui);
   4089     _vcvt(T_fp, Src0R, Conversion);
   4090     Variable *T = makeReg(IceType_i32);
   4091     _mov(T, T_fp);
   4092     if (DestTy != IceType_i32) {
   4093       Variable *T_1 = makeReg(DestTy);
   4094       lowerCast(InstCast::create(Func, InstCast::Trunc, T_1, T));
   4095       T = T_1;
   4096     }
   4097     _mov(Dest, T);
   4098     break;
   4099   }
   4100   case InstCast::Sitofp:
   4101   case InstCast::Uitofp: {
   4102     const bool SourceIsSigned = CastKind == InstCast::Sitofp;
   4103 
   4104     if (isVectorType(DestTy)) {
   4105       assert(typeElementType(DestTy) == IceType_f32);
   4106       auto *T = makeReg(DestTy);
   4107       Variable *Src0R = legalizeToReg(Src0);
   4108       _vcvt(T, Src0R,
   4109             SourceIsSigned ? InstARM32Vcvt::Vsi2s : InstARM32Vcvt::Vui2s);
   4110       _mov(Dest, T);
   4111       break;
   4112     }
   4113 
   4114     const bool DestIsF32 = isFloat32Asserting32Or64(DestTy);
   4115     if (Src0->getType() == IceType_i64) {
   4116       llvm::report_fatal_error("i64-to-fp should have been pre-lowered.");
   4117     }
   4118     // sitofp:
   4119     //     t1.i32 = sext src.int    @ sign-extends src0 if needed.
   4120     //     t2.fp32 = vmov t1.i32
   4121     //     t3.fp = vcvt.{fp}.s32    @ fp is either f32 or f64
   4122     // uitofp:
   4123     //     t1.i32 = zext src.int    @ zero-extends src0 if needed.
   4124     //     t2.fp32 = vmov t1.i32
   4125     //     t3.fp = vcvt.{fp}.s32    @ fp is either f32 or f64
   4126     if (Src0->getType() != IceType_i32) {
   4127       Variable *Src0R_32 = makeReg(IceType_i32);
   4128       lowerCast(InstCast::create(Func, SourceIsSigned ? InstCast::Sext
   4129                                                       : InstCast::Zext,
   4130                                  Src0R_32, Src0));
   4131       Src0 = Src0R_32;
   4132     }
   4133     Variable *Src0R = legalizeToReg(Src0);
   4134     Variable *Src0R_f32 = makeReg(IceType_f32);
   4135     _mov(Src0R_f32, Src0R);
   4136     Src0R = Src0R_f32;
   4137     Variable *T = makeReg(DestTy);
   4138     const InstARM32Vcvt::VcvtVariant Conversion =
   4139         DestIsF32
   4140             ? (SourceIsSigned ? InstARM32Vcvt::Si2s : InstARM32Vcvt::Ui2s)
   4141             : (SourceIsSigned ? InstARM32Vcvt::Si2d : InstARM32Vcvt::Ui2d);
   4142     _vcvt(T, Src0R, Conversion);
   4143     _mov(Dest, T);
   4144     break;
   4145   }
   4146   case InstCast::Bitcast: {
   4147     Operand *Src0 = Instr->getSrc(0);
   4148     if (DestTy == Src0->getType()) {
   4149       auto *Assign = InstAssign::create(Func, Dest, Src0);
   4150       lowerAssign(Assign);
   4151       return;
   4152     }
   4153     switch (DestTy) {
   4154     case IceType_NUM:
   4155     case IceType_void:
   4156       llvm::report_fatal_error("Unexpected bitcast.");
   4157     case IceType_i1:
   4158       UnimplementedLoweringError(this, Instr);
   4159       break;
   4160     case IceType_i8:
   4161       assert(Src0->getType() == IceType_v8i1);
   4162       llvm::report_fatal_error(
   4163           "i8 to v8i1 conversion should have been prelowered.");
   4164       break;
   4165     case IceType_i16:
   4166       assert(Src0->getType() == IceType_v16i1);
   4167       llvm::report_fatal_error(
   4168           "i16 to v16i1 conversion should have been prelowered.");
   4169       break;
   4170     case IceType_i32:
   4171     case IceType_f32: {
   4172       Variable *Src0R = legalizeToReg(Src0);
   4173       Variable *T = makeReg(DestTy);
   4174       _mov(T, Src0R);
   4175       lowerAssign(InstAssign::create(Func, Dest, T));
   4176       break;
   4177     }
   4178     case IceType_i64: {
   4179       // t0, t1 <- src0
   4180       // dest[31..0]  = t0
   4181       // dest[63..32] = t1
   4182       assert(Src0->getType() == IceType_f64);
   4183       auto *T = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
   4184       T->initHiLo(Func);
   4185       configureBitcastTemporary(T);
   4186       Variable *Src0R = legalizeToReg(Src0);
   4187       _mov(T, Src0R);
   4188       Context.insert<InstFakeUse>(T->getHi());
   4189       Context.insert<InstFakeUse>(T->getLo());
   4190       lowerAssign(InstAssign::create(Func, Dest, T));
   4191       break;
   4192     }
   4193     case IceType_f64: {
   4194       // T0 <- lo(src)
   4195       // T1 <- hi(src)
   4196       // vmov T2, T0, T1
   4197       // Dest <- T2
   4198       assert(Src0->getType() == IceType_i64);
   4199       Variable *T = makeReg(DestTy);
   4200       auto *Src64 = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
   4201       Src64->initHiLo(Func);
   4202       configureBitcastTemporary(Src64);
   4203       lowerAssign(InstAssign::create(Func, Src64, Src0));
   4204       _mov(T, Src64);
   4205       lowerAssign(InstAssign::create(Func, Dest, T));
   4206       break;
   4207     }
   4208     case IceType_v8i1:
   4209       assert(Src0->getType() == IceType_i8);
   4210       llvm::report_fatal_error(
   4211           "v8i1 to i8 conversion should have been prelowered.");
   4212       break;
   4213     case IceType_v16i1:
   4214       assert(Src0->getType() == IceType_i16);
   4215       llvm::report_fatal_error(
   4216           "v16i1 to i16 conversion should have been prelowered.");
   4217       break;
   4218     case IceType_v4i1:
   4219     case IceType_v8i16:
   4220     case IceType_v16i8:
   4221     case IceType_v4f32:
   4222     case IceType_v4i32: {
   4223       assert(typeWidthInBytes(DestTy) == typeWidthInBytes(Src0->getType()));
   4224       assert(isVectorType(DestTy) == isVectorType(Src0->getType()));
   4225       Variable *T = makeReg(DestTy);
   4226       _mov(T, Src0);
   4227       _mov(Dest, T);
   4228       break;
   4229     }
   4230     }
   4231     break;
   4232   }
   4233   }
   4234 }
   4235 
   4236 void TargetARM32::lowerExtractElement(const InstExtractElement *Instr) {
   4237   Variable *Dest = Instr->getDest();
   4238   Type DestTy = Dest->getType();
   4239 
   4240   Variable *Src0 = legalizeToReg(Instr->getSrc(0));
   4241   Operand *Src1 = Instr->getSrc(1);
   4242 
   4243   if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src1)) {
   4244     const uint32_t Index = Imm->getValue();
   4245     Variable *T = makeReg(DestTy);
   4246     Variable *TSrc0 = makeReg(Src0->getType());
   4247 
   4248     if (isFloatingType(DestTy)) {
   4249       // We need to make sure the source is in a suitable register.
   4250       TSrc0->setRegClass(RegARM32::RCARM32_QtoS);
   4251     }
   4252 
   4253     _mov(TSrc0, Src0);
   4254     _extractelement(T, TSrc0, Index);
   4255     _mov(Dest, T);
   4256     return;
   4257   }
   4258   assert(false && "extractelement requires a constant index");
   4259 }
   4260 
   4261 namespace {
   4262 // Validates FCMPARM32_TABLE's declaration w.r.t. InstFcmp::FCondition ordering
   4263 // (and naming).
   4264 enum {
   4265 #define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V) _fcmp_ll_##val,
   4266   FCMPARM32_TABLE
   4267 #undef X
   4268       _fcmp_ll_NUM
   4269 };
   4270 
   4271 enum {
   4272 #define X(tag, str) _fcmp_hl_##tag = InstFcmp::tag,
   4273   ICEINSTFCMP_TABLE
   4274 #undef X
   4275       _fcmp_hl_NUM
   4276 };
   4277 
   4278 static_assert((uint32_t)_fcmp_hl_NUM == (uint32_t)_fcmp_ll_NUM,
   4279               "Inconsistency between high-level and low-level fcmp tags.");
   4280 #define X(tag, str)                                                            \
   4281   static_assert(                                                               \
   4282       (uint32_t)_fcmp_hl_##tag == (uint32_t)_fcmp_ll_##tag,                    \
   4283       "Inconsistency between high-level and low-level fcmp tag " #tag);
   4284 ICEINSTFCMP_TABLE
   4285 #undef X
   4286 
   4287 struct {
   4288   CondARM32::Cond CC0;
   4289   CondARM32::Cond CC1;
   4290 } TableFcmp[] = {
   4291 #define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V)                           \
   4292   { CondARM32::CC0, CondARM32::CC1 }                                           \
   4293   ,
   4294     FCMPARM32_TABLE
   4295 #undef X
   4296 };
   4297 
   4298 bool isFloatingPointZero(const Operand *Src) {
   4299   if (const auto *F32 = llvm::dyn_cast<const ConstantFloat>(Src)) {
   4300     return Utils::isPositiveZero(F32->getValue());
   4301   }
   4302 
   4303   if (const auto *F64 = llvm::dyn_cast<const ConstantDouble>(Src)) {
   4304     return Utils::isPositiveZero(F64->getValue());
   4305   }
   4306 
   4307   return false;
   4308 }
   4309 } // end of anonymous namespace
   4310 
   4311 TargetARM32::CondWhenTrue TargetARM32::lowerFcmpCond(const InstFcmp *Instr) {
   4312   InstFcmp::FCond Condition = Instr->getCondition();
   4313   switch (Condition) {
   4314   case InstFcmp::False:
   4315     return CondWhenTrue(CondARM32::kNone);
   4316   case InstFcmp::True:
   4317     return CondWhenTrue(CondARM32::AL);
   4318     break;
   4319   default: {
   4320     Variable *Src0R = legalizeToReg(Instr->getSrc(0));
   4321     Operand *Src1 = Instr->getSrc(1);
   4322     if (isFloatingPointZero(Src1)) {
   4323       _vcmp(Src0R, OperandARM32FlexFpZero::create(Func, Src0R->getType()));
   4324     } else {
   4325       _vcmp(Src0R, legalizeToReg(Src1));
   4326     }
   4327     _vmrs();
   4328     assert(Condition < llvm::array_lengthof(TableFcmp));
   4329     return CondWhenTrue(TableFcmp[Condition].CC0, TableFcmp[Condition].CC1);
   4330   }
   4331   }
   4332 }
   4333 
   4334 void TargetARM32::lowerFcmp(const InstFcmp *Instr) {
   4335   Variable *Dest = Instr->getDest();
   4336   const Type DestTy = Dest->getType();
   4337 
   4338   if (isVectorType(DestTy)) {
   4339     if (Instr->getCondition() == InstFcmp::False) {
   4340       constexpr Type SafeTypeForMovingConstant = IceType_v4i32;
   4341       auto *T = makeReg(SafeTypeForMovingConstant);
   4342       _mov(T, llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(0)));
   4343       _mov(Dest, T);
   4344       return;
   4345     }
   4346 
   4347     if (Instr->getCondition() == InstFcmp::True) {
   4348       constexpr Type SafeTypeForMovingConstant = IceType_v4i32;
   4349       auto *T = makeReg(SafeTypeForMovingConstant);
   4350       _mov(T, llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(1)));
   4351       _mov(Dest, T);
   4352       return;
   4353     }
   4354 
   4355     Variable *T0;
   4356     Variable *T1;
   4357     bool Negate = false;
   4358     auto *Src0 = legalizeToReg(Instr->getSrc(0));
   4359     auto *Src1 = legalizeToReg(Instr->getSrc(1));
   4360 
   4361     switch (Instr->getCondition()) {
   4362     default:
   4363       llvm::report_fatal_error("Unhandled fp comparison.");
   4364 #define _Vcnone(Tptr, S0, S1)                                                  \
   4365   do {                                                                         \
   4366     *(Tptr) = nullptr;                                                         \
   4367   } while (0)
   4368 #define _Vceq(Tptr, S0, S1)                                                    \
   4369   do {                                                                         \
   4370     *(Tptr) = makeReg(DestTy);                                                 \
   4371     _vceq(*(Tptr), S0, S1);                                                    \
   4372   } while (0)
   4373 #define _Vcge(Tptr, S0, S1)                                                    \
   4374   do {                                                                         \
   4375     *(Tptr) = makeReg(DestTy);                                                 \
   4376     _vcge(*(Tptr), S0, S1)->setSignType(InstARM32::FS_Signed);                 \
   4377   } while (0)
   4378 #define _Vcgt(Tptr, S0, S1)                                                    \
   4379   do {                                                                         \
   4380     *(Tptr) = makeReg(DestTy);                                                 \
   4381     _vcgt(*(Tptr), S0, S1)->setSignType(InstARM32::FS_Signed);                 \
   4382   } while (0)
   4383 #define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V)                           \
   4384   case InstFcmp::val: {                                                        \
   4385     _Vc##CC0_V(&T0, (INV_V) ? Src1 : Src0, (INV_V) ? Src0 : Src1);             \
   4386     _Vc##CC1_V(&T1, (INV_V) ? Src0 : Src1, (INV_V) ? Src1 : Src0);             \
   4387     Negate = NEG_V;                                                            \
   4388   } break;
   4389       FCMPARM32_TABLE
   4390 #undef X
   4391 #undef _Vcgt
   4392 #undef _Vcge
   4393 #undef _Vceq
   4394 #undef _Vcnone
   4395     }
   4396     assert(T0 != nullptr);
   4397     Variable *T = T0;
   4398     if (T1 != nullptr) {
   4399       T = makeReg(DestTy);
   4400       _vorr(T, T0, T1);
   4401     }
   4402 
   4403     if (Negate) {
   4404       auto *TNeg = makeReg(DestTy);
   4405       _vmvn(TNeg, T);
   4406       T = TNeg;
   4407     }
   4408 
   4409     _mov(Dest, T);
   4410     return;
   4411   }
   4412 
   4413   Variable *T = makeReg(IceType_i1);
   4414   Operand *_1 = legalize(Ctx->getConstantInt32(1), Legal_Reg | Legal_Flex);
   4415   Operand *_0 =
   4416       legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   4417 
   4418   CondWhenTrue Cond = lowerFcmpCond(Instr);
   4419 
   4420   bool RedefineT = false;
   4421   if (Cond.WhenTrue0 != CondARM32::AL) {
   4422     _mov(T, _0);
   4423     RedefineT = true;
   4424   }
   4425 
   4426   if (Cond.WhenTrue0 == CondARM32::kNone) {
   4427     _mov(Dest, T);
   4428     return;
   4429   }
   4430 
   4431   if (RedefineT) {
   4432     _mov_redefined(T, _1, Cond.WhenTrue0);
   4433   } else {
   4434     _mov(T, _1, Cond.WhenTrue0);
   4435   }
   4436 
   4437   if (Cond.WhenTrue1 != CondARM32::kNone) {
   4438     _mov_redefined(T, _1, Cond.WhenTrue1);
   4439   }
   4440 
   4441   _mov(Dest, T);
   4442 }
   4443 
   4444 TargetARM32::CondWhenTrue
   4445 TargetARM32::lowerInt64IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
   4446                                 Operand *Src1) {
   4447   assert(Condition < llvm::array_lengthof(TableIcmp64));
   4448 
   4449   Int32Operands SrcsLo(loOperand(Src0), loOperand(Src1));
   4450   Int32Operands SrcsHi(hiOperand(Src0), hiOperand(Src1));
   4451   assert(SrcsLo.hasConstOperand() == SrcsHi.hasConstOperand());
   4452   assert(SrcsLo.swappedOperands() == SrcsHi.swappedOperands());
   4453 
   4454   if (SrcsLo.hasConstOperand()) {
   4455     const uint32_t ValueLo = SrcsLo.getConstantValue();
   4456     const uint32_t ValueHi = SrcsHi.getConstantValue();
   4457     const uint64_t Value = (static_cast<uint64_t>(ValueHi) << 32) | ValueLo;
   4458     if ((Condition == InstIcmp::Eq || Condition == InstIcmp::Ne) &&
   4459         Value == 0) {
   4460       Variable *T = makeReg(IceType_i32);
   4461       Variable *Src0LoR = SrcsLo.src0R(this);
   4462       Variable *Src0HiR = SrcsHi.src0R(this);
   4463       _orrs(T, Src0LoR, Src0HiR);
   4464       Context.insert<InstFakeUse>(T);
   4465       return CondWhenTrue(TableIcmp64[Condition].C1);
   4466     }
   4467 
   4468     Variable *Src0RLo = SrcsLo.src0R(this);
   4469     Variable *Src0RHi = SrcsHi.src0R(this);
   4470     Operand *Src1RFLo = SrcsLo.src1RF(this);
   4471     Operand *Src1RFHi = ValueLo == ValueHi ? Src1RFLo : SrcsHi.src1RF(this);
   4472 
   4473     const bool UseRsb =
   4474         TableIcmp64[Condition].Swapped != SrcsLo.swappedOperands();
   4475 
   4476     if (UseRsb) {
   4477       if (TableIcmp64[Condition].IsSigned) {
   4478         Variable *T = makeReg(IceType_i32);
   4479         _rsbs(T, Src0RLo, Src1RFLo);
   4480         Context.insert<InstFakeUse>(T);
   4481 
   4482         T = makeReg(IceType_i32);
   4483         _rscs(T, Src0RHi, Src1RFHi);
   4484         // We need to add a FakeUse here because liveness gets mad at us (Def
   4485         // without Use.) Note that flag-setting instructions are considered to
   4486         // have side effects and, therefore, are not DCE'ed.
   4487         Context.insert<InstFakeUse>(T);
   4488       } else {
   4489         Variable *T = makeReg(IceType_i32);
   4490         _rsbs(T, Src0RHi, Src1RFHi);
   4491         Context.insert<InstFakeUse>(T);
   4492 
   4493         T = makeReg(IceType_i32);
   4494         _rsbs(T, Src0RLo, Src1RFLo, CondARM32::EQ);
   4495         Context.insert<InstFakeUse>(T);
   4496       }
   4497     } else {
   4498       if (TableIcmp64[Condition].IsSigned) {
   4499         _cmp(Src0RLo, Src1RFLo);
   4500         Variable *T = makeReg(IceType_i32);
   4501         _sbcs(T, Src0RHi, Src1RFHi);
   4502         Context.insert<InstFakeUse>(T);
   4503       } else {
   4504         _cmp(Src0RHi, Src1RFHi);
   4505         _cmp(Src0RLo, Src1RFLo, CondARM32::EQ);
   4506       }
   4507     }
   4508 
   4509     return CondWhenTrue(TableIcmp64[Condition].C1);
   4510   }
   4511 
   4512   Variable *Src0RLo, *Src0RHi;
   4513   Operand *Src1RFLo, *Src1RFHi;
   4514   if (TableIcmp64[Condition].Swapped) {
   4515     Src0RLo = legalizeToReg(loOperand(Src1));
   4516     Src0RHi = legalizeToReg(hiOperand(Src1));
   4517     Src1RFLo = legalizeToReg(loOperand(Src0));
   4518     Src1RFHi = legalizeToReg(hiOperand(Src0));
   4519   } else {
   4520     Src0RLo = legalizeToReg(loOperand(Src0));
   4521     Src0RHi = legalizeToReg(hiOperand(Src0));
   4522     Src1RFLo = legalizeToReg(loOperand(Src1));
   4523     Src1RFHi = legalizeToReg(hiOperand(Src1));
   4524   }
   4525 
   4526   // a=icmp cond, b, c ==>
   4527   // GCC does:
   4528   //   cmp      b.hi, c.hi     or  cmp      b.lo, c.lo
   4529   //   cmp.eq   b.lo, c.lo         sbcs t1, b.hi, c.hi
   4530   //   mov.<C1> t, #1              mov.<C1> t, #1
   4531   //   mov.<C2> t, #0              mov.<C2> t, #0
   4532   //   mov      a, t               mov      a, t
   4533   // where the "cmp.eq b.lo, c.lo" is used for unsigned and "sbcs t1, hi, hi"
   4534   // is used for signed compares. In some cases, b and c need to be swapped as
   4535   // well.
   4536   //
   4537   // LLVM does:
   4538   // for EQ and NE:
   4539   //   eor  t1, b.hi, c.hi
   4540   //   eor  t2, b.lo, c.hi
   4541   //   orrs t, t1, t2
   4542   //   mov.<C> t, #1
   4543   //   mov  a, t
   4544   //
   4545   // that's nice in that it's just as short but has fewer dependencies for
   4546   // better ILP at the cost of more registers.
   4547   //
   4548   // Otherwise for signed/unsigned <, <=, etc. LLVM uses a sequence with two
   4549   // unconditional mov #0, two cmps, two conditional mov #1, and one
   4550   // conditional reg mov. That has few dependencies for good ILP, but is a
   4551   // longer sequence.
   4552   //
   4553   // So, we are going with the GCC version since it's usually better (except
   4554   // perhaps for eq/ne). We could revisit special-casing eq/ne later.
   4555   if (TableIcmp64[Condition].IsSigned) {
   4556     Variable *ScratchReg = makeReg(IceType_i32);
   4557     _cmp(Src0RLo, Src1RFLo);
   4558     _sbcs(ScratchReg, Src0RHi, Src1RFHi);
   4559     // ScratchReg isn't going to be used, but we need the side-effect of
   4560     // setting flags from this operation.
   4561     Context.insert<InstFakeUse>(ScratchReg);
   4562   } else {
   4563     _cmp(Src0RHi, Src1RFHi);
   4564     _cmp(Src0RLo, Src1RFLo, CondARM32::EQ);
   4565   }
   4566   return CondWhenTrue(TableIcmp64[Condition].C1);
   4567 }
   4568 
   4569 TargetARM32::CondWhenTrue
   4570 TargetARM32::lowerInt32IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
   4571                                 Operand *Src1) {
   4572   Int32Operands Srcs(Src0, Src1);
   4573   if (!Srcs.hasConstOperand()) {
   4574 
   4575     Variable *Src0R = Srcs.src0R(this);
   4576     Operand *Src1RF = Srcs.src1RF(this);
   4577     _cmp(Src0R, Src1RF);
   4578     return CondWhenTrue(getIcmp32Mapping(Condition));
   4579   }
   4580 
   4581   Variable *Src0R = Srcs.src0R(this);
   4582   const int32_t Value = Srcs.getConstantValue();
   4583   if ((Condition == InstIcmp::Eq || Condition == InstIcmp::Ne) && Value == 0) {
   4584     _tst(Src0R, Src0R);
   4585     return CondWhenTrue(getIcmp32Mapping(Condition));
   4586   }
   4587 
   4588   if (!Srcs.swappedOperands() && !Srcs.immediateIsFlexEncodable() &&
   4589       Srcs.negatedImmediateIsFlexEncodable()) {
   4590     Operand *Src1F = Srcs.negatedSrc1F(this);
   4591     _cmn(Src0R, Src1F);
   4592     return CondWhenTrue(getIcmp32Mapping(Condition));
   4593   }
   4594 
   4595   Operand *Src1RF = Srcs.src1RF(this);
   4596   if (!Srcs.swappedOperands()) {
   4597     _cmp(Src0R, Src1RF);
   4598   } else {
   4599     Variable *T = makeReg(IceType_i32);
   4600     _rsbs(T, Src0R, Src1RF);
   4601     Context.insert<InstFakeUse>(T);
   4602   }
   4603   return CondWhenTrue(getIcmp32Mapping(Condition));
   4604 }
   4605 
   4606 TargetARM32::CondWhenTrue
   4607 TargetARM32::lowerInt8AndInt16IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
   4608                                        Operand *Src1) {
   4609   Int32Operands Srcs(Src0, Src1);
   4610   const int32_t ShAmt = 32 - getScalarIntBitWidth(Src0->getType());
   4611   assert(ShAmt >= 0);
   4612 
   4613   if (!Srcs.hasConstOperand()) {
   4614     Variable *Src0R = makeReg(IceType_i32);
   4615     Operand *ShAmtImm = shAmtImm(ShAmt);
   4616     _lsl(Src0R, legalizeToReg(Src0), ShAmtImm);
   4617 
   4618     Variable *Src1R = legalizeToReg(Src1);
   4619     auto *Src1F = OperandARM32FlexReg::create(Func, IceType_i32, Src1R,
   4620                                               OperandARM32::LSL, ShAmtImm);
   4621     _cmp(Src0R, Src1F);
   4622     return CondWhenTrue(getIcmp32Mapping(Condition));
   4623   }
   4624 
   4625   const int32_t Value = Srcs.getConstantValue();
   4626   if ((Condition == InstIcmp::Eq || Condition == InstIcmp::Ne) && Value == 0) {
   4627     Operand *ShAmtImm = shAmtImm(ShAmt);
   4628     Variable *T = makeReg(IceType_i32);
   4629     _lsls(T, Srcs.src0R(this), ShAmtImm);
   4630     Context.insert<InstFakeUse>(T);
   4631     return CondWhenTrue(getIcmp32Mapping(Condition));
   4632   }
   4633 
   4634   Variable *ConstR = makeReg(IceType_i32);
   4635   _mov(ConstR,
   4636        legalize(Ctx->getConstantInt32(Value << ShAmt), Legal_Reg | Legal_Flex));
   4637   Operand *NonConstF = OperandARM32FlexReg::create(
   4638       Func, IceType_i32, Srcs.src0R(this), OperandARM32::LSL,
   4639       Ctx->getConstantInt32(ShAmt));
   4640 
   4641   if (Srcs.swappedOperands()) {
   4642     _cmp(ConstR, NonConstF);
   4643   } else {
   4644     Variable *T = makeReg(IceType_i32);
   4645     _rsbs(T, ConstR, NonConstF);
   4646     Context.insert<InstFakeUse>(T);
   4647   }
   4648   return CondWhenTrue(getIcmp32Mapping(Condition));
   4649 }
   4650 
   4651 TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(const InstIcmp *Instr) {
   4652   return lowerIcmpCond(Instr->getCondition(), Instr->getSrc(0),
   4653                        Instr->getSrc(1));
   4654 }
   4655 
   4656 TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(InstIcmp::ICond Condition,
   4657                                                      Operand *Src0,
   4658                                                      Operand *Src1) {
   4659   Src0 = legalizeUndef(Src0);
   4660   Src1 = legalizeUndef(Src1);
   4661 
   4662   // a=icmp cond b, c ==>
   4663   // GCC does:
   4664   //   <u/s>xtb tb, b
   4665   //   <u/s>xtb tc, c
   4666   //   cmp      tb, tc
   4667   //   mov.C1   t, #0
   4668   //   mov.C2   t, #1
   4669   //   mov      a, t
   4670   // where the unsigned/sign extension is not needed for 32-bit. They also have
   4671   // special cases for EQ and NE. E.g., for NE:
   4672   //   <extend to tb, tc>
   4673   //   subs     t, tb, tc
   4674   //   movne    t, #1
   4675   //   mov      a, t
   4676   //
   4677   // LLVM does:
   4678   //   lsl     tb, b, #<N>
   4679   //   mov     t, #0
   4680   //   cmp     tb, c, lsl #<N>
   4681   //   mov.<C> t, #1
   4682   //   mov     a, t
   4683   //
   4684   // the left shift is by 0, 16, or 24, which allows the comparison to focus on
   4685   // the digits that actually matter (for 16-bit or 8-bit signed/unsigned). For
   4686   // the unsigned case, for some reason it does similar to GCC and does a uxtb
   4687   // first. It's not clear to me why that special-casing is needed.
   4688   //
   4689   // We'll go with the LLVM way for now, since it's shorter and has just as few
   4690   // dependencies.
   4691   switch (Src0->getType()) {
   4692   default:
   4693     llvm::report_fatal_error("Unhandled type in lowerIcmpCond");
   4694   case IceType_i1:
   4695   case IceType_i8:
   4696   case IceType_i16:
   4697     return lowerInt8AndInt16IcmpCond(Condition, Src0, Src1);
   4698   case IceType_i32:
   4699     return lowerInt32IcmpCond(Condition, Src0, Src1);
   4700   case IceType_i64:
   4701     return lowerInt64IcmpCond(Condition, Src0, Src1);
   4702   }
   4703 }
   4704 
   4705 void TargetARM32::lowerIcmp(const InstIcmp *Instr) {
   4706   Variable *Dest = Instr->getDest();
   4707   const Type DestTy = Dest->getType();
   4708 
   4709   if (isVectorType(DestTy)) {
   4710     auto *T = makeReg(DestTy);
   4711     auto *Src0 = legalizeToReg(Instr->getSrc(0));
   4712     auto *Src1 = legalizeToReg(Instr->getSrc(1));
   4713     const Type SrcTy = Src0->getType();
   4714 
   4715     bool NeedsShl = false;
   4716     Type NewTypeAfterShl;
   4717     SizeT ShAmt;
   4718     switch (SrcTy) {
   4719     default:
   4720       break;
   4721     case IceType_v16i1:
   4722       NeedsShl = true;
   4723       NewTypeAfterShl = IceType_v16i8;
   4724       ShAmt = 7;
   4725       break;
   4726     case IceType_v8i1:
   4727       NeedsShl = true;
   4728       NewTypeAfterShl = IceType_v8i16;
   4729       ShAmt = 15;
   4730       break;
   4731     case IceType_v4i1:
   4732       NeedsShl = true;
   4733       NewTypeAfterShl = IceType_v4i32;
   4734       ShAmt = 31;
   4735       break;
   4736     }
   4737 
   4738     if (NeedsShl) {
   4739       auto *Imm = llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(ShAmt));
   4740       auto *Src0T = makeReg(NewTypeAfterShl);
   4741       auto *Src0Shl = makeReg(NewTypeAfterShl);
   4742       _mov(Src0T, Src0);
   4743       _vshl(Src0Shl, Src0T, Imm);
   4744       Src0 = Src0Shl;
   4745 
   4746       auto *Src1T = makeReg(NewTypeAfterShl);
   4747       auto *Src1Shl = makeReg(NewTypeAfterShl);
   4748       _mov(Src1T, Src1);
   4749       _vshl(Src1Shl, Src1T, Imm);
   4750       Src1 = Src1Shl;
   4751     }
   4752 
   4753     switch (Instr->getCondition()) {
   4754     default:
   4755       llvm::report_fatal_error("Unhandled integer comparison.");
   4756 #define _Vceq(T, S0, S1, Signed) _vceq(T, S0, S1)
   4757 #define _Vcge(T, S0, S1, Signed)                                               \
   4758   _vcge(T, S0, S1)                                                             \
   4759       ->setSignType(Signed ? InstARM32::FS_Signed : InstARM32::FS_Unsigned)
   4760 #define _Vcgt(T, S0, S1, Signed)                                               \
   4761   _vcgt(T, S0, S1)                                                             \
   4762       ->setSignType(Signed ? InstARM32::FS_Signed : InstARM32::FS_Unsigned)
   4763 #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
   4764   case InstIcmp::val: {                                                        \
   4765     _Vc##C_V(T, (INV_V) ? Src1 : Src0, (INV_V) ? Src0 : Src1, is_signed);      \
   4766     if (NEG_V) {                                                               \
   4767       auto *TInv = makeReg(DestTy);                                            \
   4768       _vmvn(TInv, T);                                                          \
   4769       T = TInv;                                                                \
   4770     }                                                                          \
   4771   } break;
   4772       ICMPARM32_TABLE
   4773 #undef X
   4774 #undef _Vcgt
   4775 #undef _Vcge
   4776 #undef _Vceq
   4777     }
   4778     _mov(Dest, T);
   4779     return;
   4780   }
   4781 
   4782   Operand *_0 =
   4783       legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   4784   Operand *_1 = legalize(Ctx->getConstantInt32(1), Legal_Reg | Legal_Flex);
   4785   Variable *T = makeReg(IceType_i1);
   4786 
   4787   _mov(T, _0);
   4788   CondWhenTrue Cond = lowerIcmpCond(Instr);
   4789   _mov_redefined(T, _1, Cond.WhenTrue0);
   4790   _mov(Dest, T);
   4791 
   4792   assert(Cond.WhenTrue1 == CondARM32::kNone);
   4793 
   4794   return;
   4795 }
   4796 
   4797 void TargetARM32::lowerInsertElement(const InstInsertElement *Instr) {
   4798   Variable *Dest = Instr->getDest();
   4799   Type DestTy = Dest->getType();
   4800 
   4801   Variable *Src0 = legalizeToReg(Instr->getSrc(0));
   4802   Variable *Src1 = legalizeToReg(Instr->getSrc(1));
   4803   Operand *Src2 = Instr->getSrc(2);
   4804 
   4805   if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2)) {
   4806     const uint32_t Index = Imm->getValue();
   4807     Variable *T = makeReg(DestTy);
   4808 
   4809     if (isFloatingType(DestTy)) {
   4810       T->setRegClass(RegARM32::RCARM32_QtoS);
   4811     }
   4812 
   4813     _mov(T, Src0);
   4814     _insertelement(T, Src1, Index);
   4815     _set_dest_redefined();
   4816     _mov(Dest, T);
   4817     return;
   4818   }
   4819   assert(false && "insertelement requires a constant index");
   4820 }
   4821 
   4822 namespace {
   4823 inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
   4824   if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
   4825     return Integer->getValue();
   4826   return Intrinsics::MemoryOrderInvalid;
   4827 }
   4828 } // end of anonymous namespace
   4829 
   4830 void TargetARM32::lowerLoadLinkedStoreExclusive(
   4831     Type Ty, Operand *Addr, std::function<Variable *(Variable *)> Operation,
   4832     CondARM32::Cond Cond) {
   4833 
   4834   auto *Retry = Context.insert<InstARM32Label>(this);
   4835 
   4836   { // scoping for loop highlighting.
   4837     Variable *Success = makeReg(IceType_i32);
   4838     Variable *Tmp = (Ty == IceType_i64) ? makeI64RegPair() : makeReg(Ty);
   4839     auto *_0 = Ctx->getConstantZero(IceType_i32);
   4840 
   4841     Context.insert<InstFakeDef>(Tmp);
   4842     Context.insert<InstFakeUse>(Tmp);
   4843     Variable *AddrR = legalizeToReg(Addr);
   4844     _ldrex(Tmp, formMemoryOperand(AddrR, Ty))->setDestRedefined();
   4845     auto *StoreValue = Operation(Tmp);
   4846     assert(StoreValue->mustHaveReg());
   4847     // strex requires Dest to be a register other than Value or Addr. This
   4848     // restriction is cleanly represented by adding an "early" definition of
   4849     // Dest (or a latter use of all the sources.)
   4850     Context.insert<InstFakeDef>(Success);
   4851     if (Cond != CondARM32::AL) {
   4852       _mov_redefined(Success, legalize(_0, Legal_Reg | Legal_Flex),
   4853                      InstARM32::getOppositeCondition(Cond));
   4854     }
   4855     _strex(Success, StoreValue, formMemoryOperand(AddrR, Ty), Cond)
   4856         ->setDestRedefined();
   4857     _cmp(Success, _0);
   4858   }
   4859 
   4860   _br(Retry, CondARM32::NE);
   4861 }
   4862 
   4863 namespace {
   4864 InstArithmetic *createArithInst(Cfg *Func, uint32_t Operation, Variable *Dest,
   4865                                 Variable *Src0, Operand *Src1) {
   4866   InstArithmetic::OpKind Oper;
   4867   switch (Operation) {
   4868   default:
   4869     llvm::report_fatal_error("Unknown AtomicRMW operation");
   4870   case Intrinsics::AtomicExchange:
   4871     llvm::report_fatal_error("Can't handle Atomic xchg operation");
   4872   case Intrinsics::AtomicAdd:
   4873     Oper = InstArithmetic::Add;
   4874     break;
   4875   case Intrinsics::AtomicAnd:
   4876     Oper = InstArithmetic::And;
   4877     break;
   4878   case Intrinsics::AtomicSub:
   4879     Oper = InstArithmetic::Sub;
   4880     break;
   4881   case Intrinsics::AtomicOr:
   4882     Oper = InstArithmetic::Or;
   4883     break;
   4884   case Intrinsics::AtomicXor:
   4885     Oper = InstArithmetic::Xor;
   4886     break;
   4887   }
   4888   return InstArithmetic::create(Func, Oper, Dest, Src0, Src1);
   4889 }
   4890 } // end of anonymous namespace
   4891 
   4892 void TargetARM32::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
   4893                                  Operand *Addr, Operand *Val) {
   4894   // retry:
   4895   //     ldrex tmp, [addr]
   4896   //     mov contents, tmp
   4897   //     op result, contents, Val
   4898   //     strex success, result, [addr]
   4899   //     cmp success, 0
   4900   //     jne retry
   4901   //     fake-use(addr, operand)  @ prevents undesirable clobbering.
   4902   //     mov dest, contents
   4903   auto DestTy = Dest->getType();
   4904 
   4905   if (DestTy == IceType_i64) {
   4906     lowerInt64AtomicRMW(Dest, Operation, Addr, Val);
   4907     return;
   4908   }
   4909 
   4910   Operand *ValRF = nullptr;
   4911   if (llvm::isa<ConstantInteger32>(Val)) {
   4912     ValRF = Val;
   4913   } else {
   4914     ValRF = legalizeToReg(Val);
   4915   }
   4916   auto *ContentsR = makeReg(DestTy);
   4917   auto *ResultR = makeReg(DestTy);
   4918 
   4919   _dmb();
   4920   lowerLoadLinkedStoreExclusive(
   4921       DestTy, Addr,
   4922       [this, Operation, ResultR, ContentsR, ValRF](Variable *Tmp) {
   4923         lowerAssign(InstAssign::create(Func, ContentsR, Tmp));
   4924         if (Operation == Intrinsics::AtomicExchange) {
   4925           lowerAssign(InstAssign::create(Func, ResultR, ValRF));
   4926         } else {
   4927           lowerArithmetic(
   4928               createArithInst(Func, Operation, ResultR, ContentsR, ValRF));
   4929         }
   4930         return ResultR;
   4931       });
   4932   _dmb();
   4933   if (auto *ValR = llvm::dyn_cast<Variable>(ValRF)) {
   4934     Context.insert<InstFakeUse>(ValR);
   4935   }
   4936   // Can't dce ContentsR.
   4937   Context.insert<InstFakeUse>(ContentsR);
   4938   lowerAssign(InstAssign::create(Func, Dest, ContentsR));
   4939 }
   4940 
   4941 void TargetARM32::lowerInt64AtomicRMW(Variable *Dest, uint32_t Operation,
   4942                                       Operand *Addr, Operand *Val) {
   4943   assert(Dest->getType() == IceType_i64);
   4944 
   4945   auto *ResultR = makeI64RegPair();
   4946 
   4947   Context.insert<InstFakeDef>(ResultR);
   4948 
   4949   Operand *ValRF = nullptr;
   4950   if (llvm::dyn_cast<ConstantInteger64>(Val)) {
   4951     ValRF = Val;
   4952   } else {
   4953     auto *ValR64 = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
   4954     ValR64->initHiLo(Func);
   4955     ValR64->setMustNotHaveReg();
   4956     ValR64->getLo()->setMustHaveReg();
   4957     ValR64->getHi()->setMustHaveReg();
   4958     lowerAssign(InstAssign::create(Func, ValR64, Val));
   4959     ValRF = ValR64;
   4960   }
   4961 
   4962   auto *ContentsR = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
   4963   ContentsR->initHiLo(Func);
   4964   ContentsR->setMustNotHaveReg();
   4965   ContentsR->getLo()->setMustHaveReg();
   4966   ContentsR->getHi()->setMustHaveReg();
   4967 
   4968   _dmb();
   4969   lowerLoadLinkedStoreExclusive(
   4970       IceType_i64, Addr,
   4971       [this, Operation, ResultR, ContentsR, ValRF](Variable *Tmp) {
   4972         lowerAssign(InstAssign::create(Func, ContentsR, Tmp));
   4973         Context.insert<InstFakeUse>(Tmp);
   4974         if (Operation == Intrinsics::AtomicExchange) {
   4975           lowerAssign(InstAssign::create(Func, ResultR, ValRF));
   4976         } else {
   4977           lowerArithmetic(
   4978               createArithInst(Func, Operation, ResultR, ContentsR, ValRF));
   4979         }
   4980         Context.insert<InstFakeUse>(ResultR->getHi());
   4981         Context.insert<InstFakeDef>(ResultR, ResultR->getLo())
   4982             ->setDestRedefined();
   4983         return ResultR;
   4984       });
   4985   _dmb();
   4986   if (auto *ValR64 = llvm::dyn_cast<Variable64On32>(ValRF)) {
   4987     Context.insert<InstFakeUse>(ValR64->getLo());
   4988     Context.insert<InstFakeUse>(ValR64->getHi());
   4989   }
   4990   lowerAssign(InstAssign::create(Func, Dest, ContentsR));
   4991 }
   4992 
   4993 void TargetARM32::postambleCtpop64(const InstCall *Instr) {
   4994   Operand *Arg0 = Instr->getArg(0);
   4995   if (isInt32Asserting32Or64(Arg0->getType())) {
   4996     return;
   4997   }
   4998   // The popcount helpers always return 32-bit values, while the intrinsic's
   4999   // signature matches some 64-bit platform's native instructions and expect to
   5000   // fill a 64-bit reg. Thus, clear the upper bits of the dest just in case the
   5001   // user doesn't do that in the IR or doesn't toss the bits via truncate.
   5002   auto *DestHi = llvm::cast<Variable>(hiOperand(Instr->getDest()));
   5003   Variable *T = makeReg(IceType_i32);
   5004   Operand *_0 =
   5005       legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   5006   _mov(T, _0);
   5007   _mov(DestHi, T);
   5008 }
   5009 
   5010 void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
   5011   Variable *Dest = Instr->getDest();
   5012   Type DestTy = (Dest != nullptr) ? Dest->getType() : IceType_void;
   5013   Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID;
   5014   switch (ID) {
   5015   case Intrinsics::AtomicFence:
   5016   case Intrinsics::AtomicFenceAll:
   5017     assert(Dest == nullptr);
   5018     _dmb();
   5019     return;
   5020   case Intrinsics::AtomicIsLockFree: {
   5021     Operand *ByteSize = Instr->getArg(0);
   5022     auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize);
   5023     if (CI == nullptr) {
   5024       // The PNaCl ABI requires the byte size to be a compile-time constant.
   5025       Func->setError("AtomicIsLockFree byte size should be compile-time const");
   5026       return;
   5027     }
   5028     static constexpr int32_t NotLockFree = 0;
   5029     static constexpr int32_t LockFree = 1;
   5030     int32_t Result = NotLockFree;
   5031     switch (CI->getValue()) {
   5032     case 1:
   5033     case 2:
   5034     case 4:
   5035     case 8:
   5036       Result = LockFree;
   5037       break;
   5038     }
   5039     _mov(Dest, legalizeToReg(Ctx->getConstantInt32(Result)));
   5040     return;
   5041   }
   5042   case Intrinsics::AtomicLoad: {
   5043     assert(isScalarIntegerType(DestTy));
   5044     // We require the memory address to be naturally aligned. Given that is the
   5045     // case, then normal loads are atomic.
   5046     if (!Intrinsics::isMemoryOrderValid(
   5047             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
   5048       Func->setError("Unexpected memory ordering for AtomicLoad");
   5049       return;
   5050     }
   5051     Variable *T;
   5052 
   5053     if (DestTy == IceType_i64) {
   5054       // ldrex is the only arm instruction that is guaranteed to load a 64-bit
   5055       // integer atomically. Everything else works with a regular ldr.
   5056       T = makeI64RegPair();
   5057       _ldrex(T, formMemoryOperand(Instr->getArg(0), IceType_i64));
   5058     } else {
   5059       T = makeReg(DestTy);
   5060       _ldr(T, formMemoryOperand(Instr->getArg(0), DestTy));
   5061     }
   5062     _dmb();
   5063     lowerAssign(InstAssign::create(Func, Dest, T));
   5064     // Adding a fake-use T to ensure the atomic load is not removed if Dest is
   5065     // unused.
   5066     Context.insert<InstFakeUse>(T);
   5067     return;
   5068   }
   5069   case Intrinsics::AtomicStore: {
   5070     // We require the memory address to be naturally aligned. Given that is the
   5071     // case, then normal loads are atomic.
   5072     if (!Intrinsics::isMemoryOrderValid(
   5073             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
   5074       Func->setError("Unexpected memory ordering for AtomicStore");
   5075       return;
   5076     }
   5077 
   5078     auto *Value = Instr->getArg(0);
   5079     if (Value->getType() == IceType_i64) {
   5080       auto *ValueR = makeI64RegPair();
   5081       Context.insert<InstFakeDef>(ValueR);
   5082       lowerAssign(InstAssign::create(Func, ValueR, Value));
   5083       _dmb();
   5084       lowerLoadLinkedStoreExclusive(
   5085           IceType_i64, Instr->getArg(1), [this, ValueR](Variable *Tmp) {
   5086             // The following fake-use prevents the ldrex instruction from being
   5087             // dead code eliminated.
   5088             Context.insert<InstFakeUse>(llvm::cast<Variable>(loOperand(Tmp)));
   5089             Context.insert<InstFakeUse>(llvm::cast<Variable>(hiOperand(Tmp)));
   5090             Context.insert<InstFakeUse>(Tmp);
   5091             return ValueR;
   5092           });
   5093       Context.insert<InstFakeUse>(ValueR);
   5094       _dmb();
   5095       return;
   5096     }
   5097 
   5098     auto *ValueR = legalizeToReg(Instr->getArg(0));
   5099     const auto ValueTy = ValueR->getType();
   5100     assert(isScalarIntegerType(ValueTy));
   5101     auto *Addr = legalizeToReg(Instr->getArg(1));
   5102 
   5103     // non-64-bit stores are atomically as long as the address is aligned. This
   5104     // is PNaCl, so addresses are aligned.
   5105     _dmb();
   5106     _str(ValueR, formMemoryOperand(Addr, ValueTy));
   5107     _dmb();
   5108     return;
   5109   }
   5110   case Intrinsics::AtomicCmpxchg: {
   5111     // retry:
   5112     //     ldrex tmp, [addr]
   5113     //     cmp tmp, expected
   5114     //     mov expected, tmp
   5115     //     strexeq success, new, [addr]
   5116     //     cmpeq success, #0
   5117     //     bne retry
   5118     //     mov dest, expected
   5119     assert(isScalarIntegerType(DestTy));
   5120     // We require the memory address to be naturally aligned. Given that is the
   5121     // case, then normal loads are atomic.
   5122     if (!Intrinsics::isMemoryOrderValid(
   5123             ID, getConstantMemoryOrder(Instr->getArg(3)),
   5124             getConstantMemoryOrder(Instr->getArg(4)))) {
   5125       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
   5126       return;
   5127     }
   5128 
   5129     if (DestTy == IceType_i64) {
   5130       Variable *LoadedValue = nullptr;
   5131 
   5132       auto *New = makeI64RegPair();
   5133       Context.insert<InstFakeDef>(New);
   5134       lowerAssign(InstAssign::create(Func, New, Instr->getArg(2)));
   5135 
   5136       auto *Expected = makeI64RegPair();
   5137       Context.insert<InstFakeDef>(Expected);
   5138       lowerAssign(InstAssign::create(Func, Expected, Instr->getArg(1)));
   5139 
   5140       _dmb();
   5141       lowerLoadLinkedStoreExclusive(
   5142           DestTy, Instr->getArg(0),
   5143           [this, Expected, New, Instr, DestTy, &LoadedValue](Variable *Tmp) {
   5144             auto *ExpectedLoR = llvm::cast<Variable>(loOperand(Expected));
   5145             auto *ExpectedHiR = llvm::cast<Variable>(hiOperand(Expected));
   5146             auto *TmpLoR = llvm::cast<Variable>(loOperand(Tmp));
   5147             auto *TmpHiR = llvm::cast<Variable>(hiOperand(Tmp));
   5148             _cmp(TmpLoR, ExpectedLoR);
   5149             _cmp(TmpHiR, ExpectedHiR, CondARM32::EQ);
   5150             LoadedValue = Tmp;
   5151             return New;
   5152           },
   5153           CondARM32::EQ);
   5154       _dmb();
   5155 
   5156       Context.insert<InstFakeUse>(LoadedValue);
   5157       lowerAssign(InstAssign::create(Func, Dest, LoadedValue));
   5158       // The fake-use Expected prevents the assignments to Expected (above)
   5159       // from being removed if Dest is not used.
   5160       Context.insert<InstFakeUse>(Expected);
   5161       // New needs to be alive here, or its live range will end in the
   5162       // strex instruction.
   5163       Context.insert<InstFakeUse>(New);
   5164       return;
   5165     }
   5166 
   5167     auto *New = legalizeToReg(Instr->getArg(2));
   5168     auto *Expected = legalizeToReg(Instr->getArg(1));
   5169     Variable *LoadedValue = nullptr;
   5170 
   5171     _dmb();
   5172     lowerLoadLinkedStoreExclusive(
   5173         DestTy, Instr->getArg(0),
   5174         [this, Expected, New, Instr, DestTy, &LoadedValue](Variable *Tmp) {
   5175           lowerIcmpCond(InstIcmp::Eq, Tmp, Expected);
   5176           LoadedValue = Tmp;
   5177           return New;
   5178         },
   5179         CondARM32::EQ);
   5180     _dmb();
   5181 
   5182     lowerAssign(InstAssign::create(Func, Dest, LoadedValue));
   5183     Context.insert<InstFakeUse>(Expected);
   5184     Context.insert<InstFakeUse>(New);
   5185     return;
   5186   }
   5187   case Intrinsics::AtomicRMW: {
   5188     if (!Intrinsics::isMemoryOrderValid(
   5189             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
   5190       Func->setError("Unexpected memory ordering for AtomicRMW");
   5191       return;
   5192     }
   5193     lowerAtomicRMW(
   5194         Dest, static_cast<uint32_t>(
   5195                   llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
   5196         Instr->getArg(1), Instr->getArg(2));
   5197     return;
   5198   }
   5199   case Intrinsics::Bswap: {
   5200     Operand *Val = Instr->getArg(0);
   5201     Type Ty = Val->getType();
   5202     if (Ty == IceType_i64) {
   5203       Val = legalizeUndef(Val);
   5204       Variable *Val_Lo = legalizeToReg(loOperand(Val));
   5205       Variable *Val_Hi = legalizeToReg(hiOperand(Val));
   5206       Variable *T_Lo = makeReg(IceType_i32);
   5207       Variable *T_Hi = makeReg(IceType_i32);
   5208       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   5209       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   5210       _rev(T_Lo, Val_Lo);
   5211       _rev(T_Hi, Val_Hi);
   5212       _mov(DestLo, T_Hi);
   5213       _mov(DestHi, T_Lo);
   5214     } else {
   5215       assert(Ty == IceType_i32 || Ty == IceType_i16);
   5216       Variable *ValR = legalizeToReg(Val);
   5217       Variable *T = makeReg(Ty);
   5218       _rev(T, ValR);
   5219       if (Val->getType() == IceType_i16) {
   5220         Operand *_16 = shAmtImm(16);
   5221         _lsr(T, T, _16);
   5222       }
   5223       _mov(Dest, T);
   5224     }
   5225     return;
   5226   }
   5227   case Intrinsics::Ctpop: {
   5228     llvm::report_fatal_error("Ctpop should have been prelowered.");
   5229   }
   5230   case Intrinsics::Ctlz: {
   5231     // The "is zero undef" parameter is ignored and we always return a
   5232     // well-defined value.
   5233     Operand *Val = Instr->getArg(0);
   5234     Variable *ValLoR;
   5235     Variable *ValHiR = nullptr;
   5236     if (Val->getType() == IceType_i64) {
   5237       Val = legalizeUndef(Val);
   5238       ValLoR = legalizeToReg(loOperand(Val));
   5239       ValHiR = legalizeToReg(hiOperand(Val));
   5240     } else {
   5241       ValLoR = legalizeToReg(Val);
   5242     }
   5243     lowerCLZ(Dest, ValLoR, ValHiR);
   5244     return;
   5245   }
   5246   case Intrinsics::Cttz: {
   5247     // Essentially like Clz, but reverse the bits first.
   5248     Operand *Val = Instr->getArg(0);
   5249     Variable *ValLoR;
   5250     Variable *ValHiR = nullptr;
   5251     if (Val->getType() == IceType_i64) {
   5252       Val = legalizeUndef(Val);
   5253       ValLoR = legalizeToReg(loOperand(Val));
   5254       ValHiR = legalizeToReg(hiOperand(Val));
   5255       Variable *TLo = makeReg(IceType_i32);
   5256       Variable *THi = makeReg(IceType_i32);
   5257       _rbit(TLo, ValLoR);
   5258       _rbit(THi, ValHiR);
   5259       ValLoR = THi;
   5260       ValHiR = TLo;
   5261     } else {
   5262       ValLoR = legalizeToReg(Val);
   5263       Variable *T = makeReg(IceType_i32);
   5264       _rbit(T, ValLoR);
   5265       ValLoR = T;
   5266     }
   5267     lowerCLZ(Dest, ValLoR, ValHiR);
   5268     return;
   5269   }
   5270   case Intrinsics::Fabs: {
   5271     Variable *T = makeReg(DestTy);
   5272     _vabs(T, legalizeToReg(Instr->getArg(0)));
   5273     _mov(Dest, T);
   5274     return;
   5275   }
   5276   case Intrinsics::Longjmp: {
   5277     llvm::report_fatal_error("longjmp should have been prelowered.");
   5278   }
   5279   case Intrinsics::Memcpy: {
   5280     llvm::report_fatal_error("memcpy should have been prelowered.");
   5281   }
   5282   case Intrinsics::Memmove: {
   5283     llvm::report_fatal_error("memmove should have been prelowered.");
   5284   }
   5285   case Intrinsics::Memset: {
   5286     llvm::report_fatal_error("memmove should have been prelowered.");
   5287   }
   5288   case Intrinsics::NaClReadTP: {
   5289     if (SandboxingType != ST_NaCl) {
   5290       llvm::report_fatal_error("nacl-read-tp should have been prelowered.");
   5291     }
   5292     Variable *TP = legalizeToReg(OperandARM32Mem::create(
   5293         Func, getPointerType(), getPhysicalRegister(RegARM32::Reg_r9),
   5294         llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32))));
   5295     _mov(Dest, TP);
   5296     return;
   5297   }
   5298   case Intrinsics::Setjmp: {
   5299     llvm::report_fatal_error("setjmp should have been prelowered.");
   5300   }
   5301   case Intrinsics::Sqrt: {
   5302     assert(isScalarFloatingType(Dest->getType()) ||
   5303            getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl);
   5304     Variable *Src = legalizeToReg(Instr->getArg(0));
   5305     Variable *T = makeReg(DestTy);
   5306     _vsqrt(T, Src);
   5307     _mov(Dest, T);
   5308     return;
   5309   }
   5310   case Intrinsics::Stacksave: {
   5311     Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   5312     _mov(Dest, SP);
   5313     return;
   5314   }
   5315   case Intrinsics::Stackrestore: {
   5316     Variable *Val = legalizeToReg(Instr->getArg(0));
   5317     Sandboxer(this).reset_sp(Val);
   5318     return;
   5319   }
   5320   case Intrinsics::Trap:
   5321     _trap();
   5322     return;
   5323   case Intrinsics::AddSaturateSigned:
   5324   case Intrinsics::AddSaturateUnsigned: {
   5325     bool Unsigned = (ID == Intrinsics::AddSaturateUnsigned);
   5326     Variable *Src0 = legalizeToReg(Instr->getArg(0));
   5327     Variable *Src1 = legalizeToReg(Instr->getArg(1));
   5328     Variable *T = makeReg(DestTy);
   5329     _vqadd(T, Src0, Src1, Unsigned);
   5330     _mov(Dest, T);
   5331     return;
   5332   }
   5333   case Intrinsics::LoadSubVector: {
   5334     assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
   5335            "LoadSubVector second argument must be a constant");
   5336     Variable *Dest = Instr->getDest();
   5337     Type Ty = Dest->getType();
   5338     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
   5339     Operand *Addr = Instr->getArg(0);
   5340     OperandARM32Mem *Src = formMemoryOperand(Addr, Ty);
   5341     doMockBoundsCheck(Src);
   5342 
   5343     if (Dest->isRematerializable()) {
   5344       Context.insert<InstFakeDef>(Dest);
   5345       return;
   5346     }
   5347 
   5348     auto *T = makeReg(Ty);
   5349     switch (SubVectorSize->getValue()) {
   5350     case 4:
   5351       _vldr1d(T, Src);
   5352       break;
   5353     case 8:
   5354       _vldr1q(T, Src);
   5355       break;
   5356     default:
   5357       Func->setError("Unexpected size for LoadSubVector");
   5358       return;
   5359     }
   5360     _mov(Dest, T);
   5361     return;
   5362   }
   5363   case Intrinsics::StoreSubVector: {
   5364     assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
   5365            "StoreSubVector third argument must be a constant");
   5366     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
   5367     Variable *Value = legalizeToReg(Instr->getArg(0));
   5368     Operand *Addr = Instr->getArg(1);
   5369     OperandARM32Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
   5370     doMockBoundsCheck(NewAddr);
   5371 
   5372     Value = legalizeToReg(Value);
   5373 
   5374     switch (SubVectorSize->getValue()) {
   5375     case 4:
   5376       _vstr1d(Value, NewAddr);
   5377       break;
   5378     case 8:
   5379       _vstr1q(Value, NewAddr);
   5380       break;
   5381     default:
   5382       Func->setError("Unexpected size for StoreSubVector");
   5383       return;
   5384     }
   5385     return;
   5386   }
   5387   case Intrinsics::MultiplyAddPairs: {
   5388     Variable *Src0 = legalizeToReg(Instr->getArg(0));
   5389     Variable *Src1 = legalizeToReg(Instr->getArg(1));
   5390     Variable *T = makeReg(DestTy);
   5391     _vmlap(T, Src0, Src1);
   5392     _mov(Dest, T);
   5393     return;
   5394   }
   5395   case Intrinsics::MultiplyHighSigned:
   5396   case Intrinsics::MultiplyHighUnsigned: {
   5397     bool Unsigned = (ID == Intrinsics::MultiplyHighUnsigned);
   5398     Variable *Src0 = legalizeToReg(Instr->getArg(0));
   5399     Variable *Src1 = legalizeToReg(Instr->getArg(1));
   5400     Variable *T = makeReg(DestTy);
   5401     _vmulh(T, Src0, Src1, Unsigned);
   5402     _mov(Dest, T);
   5403     return;
   5404   }
   5405   case Intrinsics::Nearbyint: {
   5406     UnimplementedLoweringError(this, Instr);
   5407     return;
   5408   }
   5409   case Intrinsics::Round: {
   5410     UnimplementedLoweringError(this, Instr);
   5411     return;
   5412   }
   5413   case Intrinsics::SignMask: {
   5414     UnimplementedLoweringError(this, Instr);
   5415     return;
   5416   }
   5417   case Intrinsics::SubtractSaturateSigned:
   5418   case Intrinsics::SubtractSaturateUnsigned: {
   5419     bool Unsigned = (ID == Intrinsics::SubtractSaturateUnsigned);
   5420     Variable *Src0 = legalizeToReg(Instr->getArg(0));
   5421     Variable *Src1 = legalizeToReg(Instr->getArg(1));
   5422     Variable *T = makeReg(DestTy);
   5423     _vqsub(T, Src0, Src1, Unsigned);
   5424     _mov(Dest, T);
   5425     return;
   5426   }
   5427   case Intrinsics::VectorPackSigned:
   5428   case Intrinsics::VectorPackUnsigned: {
   5429     bool Unsigned = (ID == Intrinsics::VectorPackUnsigned);
   5430     bool Saturating = true;
   5431     Variable *Src0 = legalizeToReg(Instr->getArg(0));
   5432     Variable *Src1 = legalizeToReg(Instr->getArg(1));
   5433     Variable *T = makeReg(DestTy);
   5434     _vqmovn2(T, Src0, Src1, Unsigned, Saturating);
   5435     _mov(Dest, T);
   5436     return;
   5437   }
   5438   default: // UnknownIntrinsic
   5439     Func->setError("Unexpected intrinsic");
   5440     return;
   5441   }
   5442   return;
   5443 }
   5444 
   5445 void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) {
   5446   Type Ty = Dest->getType();
   5447   assert(Ty == IceType_i32 || Ty == IceType_i64);
   5448   Variable *T = makeReg(IceType_i32);
   5449   _clz(T, ValLoR);
   5450   if (Ty == IceType_i64) {
   5451     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
   5452     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
   5453     Operand *Zero =
   5454         legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
   5455     Operand *ThirtyTwo =
   5456         legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
   5457     _cmp(ValHiR, Zero);
   5458     Variable *T2 = makeReg(IceType_i32);
   5459     _add(T2, T, ThirtyTwo);
   5460     _clz(T2, ValHiR, CondARM32::NE);
   5461     // T2 is actually a source as well when the predicate is not AL (since it
   5462     // may leave T2 alone). We use _set_dest_redefined to prolong the liveness
   5463     // of T2 as if it was used as a source.
   5464     _set_dest_redefined();
   5465     _mov(DestLo, T2);
   5466     Variable *T3 = makeReg(Zero->getType());
   5467     _mov(T3, Zero);
   5468     _mov(DestHi, T3);
   5469     return;
   5470   }
   5471   _mov(Dest, T);
   5472   return;
   5473 }
   5474 
   5475 void TargetARM32::lowerLoad(const InstLoad *Load) {
   5476   // A Load instruction can be treated the same as an Assign instruction, after
   5477   // the source operand is transformed into an OperandARM32Mem operand.
   5478   Type Ty = Load->getDest()->getType();
   5479   Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
   5480   Variable *DestLoad = Load->getDest();
   5481 
   5482   // TODO(jvoung): handled folding opportunities. Sign and zero extension can
   5483   // be folded into a load.
   5484   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
   5485   lowerAssign(Assign);
   5486 }
   5487 
   5488 namespace {
   5489 void dumpAddressOpt(const Cfg *Func, const Variable *Base, int32_t Offset,
   5490                     const Variable *OffsetReg, int16_t OffsetRegShAmt,
   5491                     const Inst *Reason) {
   5492   if (!BuildDefs::dump())
   5493     return;
   5494   if (!Func->isVerbose(IceV_AddrOpt))
   5495     return;
   5496   OstreamLocker _(Func->getContext());
   5497   Ostream &Str = Func->getContext()->getStrDump();
   5498   Str << "Instruction: ";
   5499   Reason->dumpDecorated(Func);
   5500   Str << "  results in Base=";
   5501   if (Base)
   5502     Base->dump(Func);
   5503   else
   5504     Str << "<null>";
   5505   Str << ", OffsetReg=";
   5506   if (OffsetReg)
   5507     OffsetReg->dump(Func);
   5508   else
   5509     Str << "<null>";
   5510   Str << ", Shift=" << OffsetRegShAmt << ", Offset=" << Offset << "\n";
   5511 }
   5512 
   5513 bool matchAssign(const VariablesMetadata *VMetadata, Variable **Var,
   5514                  int32_t *Offset, const Inst **Reason) {
   5515   // Var originates from Var=SrcVar ==> set Var:=SrcVar
   5516   if (*Var == nullptr)
   5517     return false;
   5518   const Inst *VarAssign = VMetadata->getSingleDefinition(*Var);
   5519   if (!VarAssign)
   5520     return false;
   5521   assert(!VMetadata->isMultiDef(*Var));
   5522   if (!llvm::isa<InstAssign>(VarAssign))
   5523     return false;
   5524 
   5525   Operand *SrcOp = VarAssign->getSrc(0);
   5526   bool Optimized = false;
   5527   if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
   5528     if (!VMetadata->isMultiDef(SrcVar) ||
   5529         // TODO: ensure SrcVar stays single-BB
   5530         false) {
   5531       Optimized = true;
   5532       *Var = SrcVar;
   5533     } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
   5534       int32_t MoreOffset = Const->getValue();
   5535       int32_t NewOffset = MoreOffset + *Offset;
   5536       if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
   5537         return false;
   5538       *Var = nullptr;
   5539       *Offset += NewOffset;
   5540       Optimized = true;
   5541     }
   5542   }
   5543 
   5544   if (Optimized) {
   5545     *Reason = VarAssign;
   5546   }
   5547 
   5548   return Optimized;
   5549 }
   5550 
   5551 bool isAddOrSub(const Inst *Instr, InstArithmetic::OpKind *Kind) {
   5552   if (const auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
   5553     switch (Arith->getOp()) {
   5554     default:
   5555       return false;
   5556     case InstArithmetic::Add:
   5557     case InstArithmetic::Sub:
   5558       *Kind = Arith->getOp();
   5559       return true;
   5560     }
   5561   }
   5562   return false;
   5563 }
   5564 
   5565 bool matchCombinedBaseIndex(const VariablesMetadata *VMetadata, Variable **Base,
   5566                             Variable **OffsetReg, int32_t OffsetRegShamt,
   5567                             const Inst **Reason) {
   5568   // OffsetReg==nullptr && Base is Base=Var1+Var2 ==>
   5569   //   set Base=Var1, OffsetReg=Var2, Shift=0
   5570   if (*Base == nullptr)
   5571     return false;
   5572   if (*OffsetReg != nullptr)
   5573     return false;
   5574   (void)OffsetRegShamt;
   5575   assert(OffsetRegShamt == 0);
   5576   const Inst *BaseInst = VMetadata->getSingleDefinition(*Base);
   5577   if (BaseInst == nullptr)
   5578     return false;
   5579   assert(!VMetadata->isMultiDef(*Base));
   5580   if (BaseInst->getSrcSize() < 2)
   5581     return false;
   5582   auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0));
   5583   if (!Var1)
   5584     return false;
   5585   if (VMetadata->isMultiDef(Var1))
   5586     return false;
   5587   auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1));
   5588   if (!Var2)
   5589     return false;
   5590   if (VMetadata->isMultiDef(Var2))
   5591     return false;
   5592   InstArithmetic::OpKind _;
   5593   if (!isAddOrSub(BaseInst, &_) ||
   5594       // TODO: ensure Var1 and Var2 stay single-BB
   5595       false)
   5596     return false;
   5597   *Base = Var1;
   5598   *OffsetReg = Var2;
   5599   // OffsetRegShamt is already 0.
   5600   *Reason = BaseInst;
   5601   return true;
   5602 }
   5603 
   5604 bool matchShiftedOffsetReg(const VariablesMetadata *VMetadata,
   5605                            Variable **OffsetReg, OperandARM32::ShiftKind *Kind,
   5606                            int32_t *OffsetRegShamt, const Inst **Reason) {
   5607   // OffsetReg is OffsetReg=Var*Const && log2(Const)+Shift<=32 ==>
   5608   //   OffsetReg=Var, Shift+=log2(Const)
   5609   // OffsetReg is OffsetReg=Var<<Const && Const+Shift<=32 ==>
   5610   //   OffsetReg=Var, Shift+=Const
   5611   // OffsetReg is OffsetReg=Var>>Const && Const-Shift>=-32 ==>
   5612   //   OffsetReg=Var, Shift-=Const
   5613   OperandARM32::ShiftKind NewShiftKind = OperandARM32::kNoShift;
   5614   if (*OffsetReg == nullptr)
   5615     return false;
   5616   auto *IndexInst = VMetadata->getSingleDefinition(*OffsetReg);
   5617   if (IndexInst == nullptr)
   5618     return false;
   5619   assert(!VMetadata->isMultiDef(*OffsetReg));
   5620   if (IndexInst->getSrcSize() < 2)
   5621     return false;
   5622   auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst);
   5623   if (ArithInst == nullptr)
   5624     return false;
   5625   auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0));
   5626   if (Var == nullptr)
   5627     return false;
   5628   auto *Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1));
   5629   if (Const == nullptr) {
   5630     assert(!llvm::isa<ConstantInteger32>(ArithInst->getSrc(0)));
   5631     return false;
   5632   }
   5633   if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
   5634     return false;
   5635 
   5636   uint32_t NewShamt = -1;
   5637   switch (ArithInst->getOp()) {
   5638   default:
   5639     return false;
   5640   case InstArithmetic::Shl: {
   5641     NewShiftKind = OperandARM32::LSL;
   5642     NewShamt = Const->getValue();
   5643     if (NewShamt > 31)
   5644       return false;
   5645   } break;
   5646   case InstArithmetic::Lshr: {
   5647     NewShiftKind = OperandARM32::LSR;
   5648     NewShamt = Const->getValue();
   5649     if (NewShamt > 31)
   5650       return false;
   5651   } break;
   5652   case InstArithmetic::Ashr: {
   5653     NewShiftKind = OperandARM32::ASR;
   5654     NewShamt = Const->getValue();
   5655     if (NewShamt > 31)
   5656       return false;
   5657   } break;
   5658   case InstArithmetic::Udiv:
   5659   case InstArithmetic::Mul: {
   5660     const uint32_t UnsignedConst = Const->getValue();
   5661     NewShamt = llvm::findFirstSet(UnsignedConst);
   5662     if (NewShamt != llvm::findLastSet(UnsignedConst)) {
   5663       // First bit set is not the same as the last bit set, so Const is not
   5664       // a power of 2.
   5665       return false;
   5666     }
   5667     NewShiftKind = ArithInst->getOp() == InstArithmetic::Udiv
   5668                        ? OperandARM32::LSR
   5669                        : OperandARM32::LSL;
   5670   } break;
   5671   }
   5672   // Allowed "transitions":
   5673   //   kNoShift -> * iff NewShamt < 31
   5674   //   LSL -> LSL    iff NewShamt + OffsetRegShamt < 31
   5675   //   LSR -> LSR    iff NewShamt + OffsetRegShamt < 31
   5676   //   ASR -> ASR    iff NewShamt + OffsetRegShamt < 31
   5677   if (*Kind != OperandARM32::kNoShift && *Kind != NewShiftKind) {
   5678     return false;
   5679   }
   5680   const int32_t NewOffsetRegShamt = *OffsetRegShamt + NewShamt;
   5681   if (NewOffsetRegShamt > 31)
   5682     return false;
   5683   *OffsetReg = Var;
   5684   *OffsetRegShamt = NewOffsetRegShamt;
   5685   *Kind = NewShiftKind;
   5686   *Reason = IndexInst;
   5687   return true;
   5688 }
   5689 
   5690 bool matchOffsetBase(const VariablesMetadata *VMetadata, Variable **Base,
   5691                      int32_t *Offset, const Inst **Reason) {
   5692   // Base is Base=Var+Const || Base is Base=Const+Var ==>
   5693   //   set Base=Var, Offset+=Const
   5694   // Base is Base=Var-Const ==>
   5695   //   set Base=Var, Offset-=Const
   5696   if (*Base == nullptr)
   5697     return false;
   5698   const Inst *BaseInst = VMetadata->getSingleDefinition(*Base);
   5699   if (BaseInst == nullptr) {
   5700     return false;
   5701   }
   5702   assert(!VMetadata->isMultiDef(*Base));
   5703 
   5704   auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(BaseInst);
   5705   if (ArithInst == nullptr)
   5706     return false;
   5707   InstArithmetic::OpKind Kind;
   5708   if (!isAddOrSub(ArithInst, &Kind))
   5709     return false;
   5710   bool IsAdd = Kind == InstArithmetic::Add;
   5711   Operand *Src0 = ArithInst->getSrc(0);
   5712   Operand *Src1 = ArithInst->getSrc(1);
   5713   auto *Var0 = llvm::dyn_cast<Variable>(Src0);
   5714   auto *Var1 = llvm::dyn_cast<Variable>(Src1);
   5715   auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
   5716   auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
   5717   Variable *NewBase = nullptr;
   5718   int32_t NewOffset = *Offset;
   5719 
   5720   if (Var0 == nullptr && Const0 == nullptr) {
   5721     assert(llvm::isa<ConstantRelocatable>(Src0));
   5722     return false;
   5723   }
   5724 
   5725   if (Var1 == nullptr && Const1 == nullptr) {
   5726     assert(llvm::isa<ConstantRelocatable>(Src1));
   5727     return false;
   5728   }
   5729 
   5730   if (Var0 && Var1)
   5731     // TODO(jpp): merge base/index splitting into here.
   5732     return false;
   5733   if (!IsAdd && Var1)
   5734     return false;
   5735   if (Var0)
   5736     NewBase = Var0;
   5737   else if (Var1)
   5738     NewBase = Var1;
   5739   // Compute the updated constant offset.
   5740   if (Const0) {
   5741     int32_t MoreOffset = IsAdd ? Const0->getValue() : -Const0->getValue();
   5742     if (Utils::WouldOverflowAdd(NewOffset, MoreOffset))
   5743       return false;
   5744     NewOffset += MoreOffset;
   5745   }
   5746   if (Const1) {
   5747     int32_t MoreOffset = IsAdd ? Const1->getValue() : -Const1->getValue();
   5748     if (Utils::WouldOverflowAdd(NewOffset, MoreOffset))
   5749       return false;
   5750     NewOffset += MoreOffset;
   5751   }
   5752 
   5753   // Update the computed address parameters once we are sure optimization
   5754   // is valid.
   5755   *Base = NewBase;
   5756   *Offset = NewOffset;
   5757   *Reason = BaseInst;
   5758   return true;
   5759 }
   5760 } // end of anonymous namespace
   5761 
   5762 OperandARM32Mem *TargetARM32::formAddressingMode(Type Ty, Cfg *Func,
   5763                                                  const Inst *LdSt,
   5764                                                  Operand *Base) {
   5765   assert(Base != nullptr);
   5766   int32_t OffsetImm = 0;
   5767   Variable *OffsetReg = nullptr;
   5768   int32_t OffsetRegShamt = 0;
   5769   OperandARM32::ShiftKind ShiftKind = OperandARM32::kNoShift;
   5770 
   5771   Func->resetCurrentNode();
   5772   if (Func->isVerbose(IceV_AddrOpt)) {
   5773     OstreamLocker _(Func->getContext());
   5774     Ostream &Str = Func->getContext()->getStrDump();
   5775     Str << "\nAddress mode formation:\t";
   5776     LdSt->dumpDecorated(Func);
   5777   }
   5778 
   5779   if (isVectorType(Ty))
   5780     // vector loads and stores do not allow offsets, and only support the
   5781     // "[reg]" addressing mode (the other supported modes are write back.)
   5782     return nullptr;
   5783 
   5784   auto *BaseVar = llvm::dyn_cast<Variable>(Base);
   5785   if (BaseVar == nullptr)
   5786     return nullptr;
   5787 
   5788   (void)MemTraitsSize;
   5789   assert(Ty < MemTraitsSize);
   5790   auto *TypeTraits = &MemTraits[Ty];
   5791   const bool CanHaveIndex = !NeedSandboxing && TypeTraits->CanHaveIndex;
   5792   const bool CanHaveShiftedIndex =
   5793       !NeedSandboxing && TypeTraits->CanHaveShiftedIndex;
   5794   const bool CanHaveImm = TypeTraits->CanHaveImm;
   5795   const int32_t ValidImmMask = TypeTraits->ValidImmMask;
   5796   (void)ValidImmMask;
   5797   assert(!CanHaveImm || ValidImmMask >= 0);
   5798 
   5799   const VariablesMetadata *VMetadata = Func->getVMetadata();
   5800   const Inst *Reason = nullptr;
   5801 
   5802   do {
   5803     if (Reason != nullptr) {
   5804       dumpAddressOpt(Func, BaseVar, OffsetImm, OffsetReg, OffsetRegShamt,
   5805                      Reason);
   5806       Reason = nullptr;
   5807     }
   5808 
   5809     if (matchAssign(VMetadata, &BaseVar, &OffsetImm, &Reason)) {
   5810       continue;
   5811     }
   5812 
   5813     if (CanHaveIndex &&
   5814         matchAssign(VMetadata, &OffsetReg, &OffsetImm, &Reason)) {
   5815       continue;
   5816     }
   5817 
   5818     if (CanHaveIndex && matchCombinedBaseIndex(VMetadata, &BaseVar, &OffsetReg,
   5819                                                OffsetRegShamt, &Reason)) {
   5820       continue;
   5821     }
   5822 
   5823     if (CanHaveShiftedIndex) {
   5824       if (matchShiftedOffsetReg(VMetadata, &OffsetReg, &ShiftKind,
   5825                                 &OffsetRegShamt, &Reason)) {
   5826         continue;
   5827       }
   5828 
   5829       if ((OffsetRegShamt == 0) &&
   5830           matchShiftedOffsetReg(VMetadata, &BaseVar, &ShiftKind,
   5831                                 &OffsetRegShamt, &Reason)) {
   5832         std::swap(BaseVar, OffsetReg);
   5833         continue;
   5834       }
   5835     }
   5836 
   5837     if (matchOffsetBase(VMetadata, &BaseVar, &OffsetImm, &Reason)) {
   5838       continue;
   5839     }
   5840   } while (Reason);
   5841 
   5842   if (BaseVar == nullptr) {
   5843     // [OffsetReg{, LSL Shamt}{, #OffsetImm}] is not legal in ARM, so we have to
   5844     // legalize the addressing mode to [BaseReg, OffsetReg{, LSL Shamt}].
   5845     // Instead of a zeroed BaseReg, we initialize it with OffsetImm:
   5846     //
   5847     // [OffsetReg{, LSL Shamt}{, #OffsetImm}] ->
   5848     //     mov BaseReg, #OffsetImm
   5849     //     use of [BaseReg, OffsetReg{, LSL Shamt}]
   5850     //
   5851     const Type PointerType = getPointerType();
   5852     BaseVar = makeReg(PointerType);
   5853     Context.insert<InstAssign>(BaseVar, Ctx->getConstantInt32(OffsetImm));
   5854     OffsetImm = 0;
   5855   } else if (OffsetImm != 0) {
   5856     // ARM Ldr/Str instructions have limited range immediates. The formation
   5857     // loop above materialized an Immediate carelessly, so we ensure the
   5858     // generated offset is sane.
   5859     const int32_t PositiveOffset = OffsetImm > 0 ? OffsetImm : -OffsetImm;
   5860     const InstArithmetic::OpKind Op =
   5861         OffsetImm > 0 ? InstArithmetic::Add : InstArithmetic::Sub;
   5862 
   5863     if (!CanHaveImm || !isLegalMemOffset(Ty, OffsetImm) ||
   5864         OffsetReg != nullptr) {
   5865       if (OffsetReg == nullptr) {
   5866         // We formed a [Base, #const] addressing mode which is not encodable in
   5867         // ARM. There is little point in forming an address mode now if we don't
   5868         // have an offset. Effectively, we would end up with something like
   5869         //
   5870         // [Base, #const] -> add T, Base, #const
   5871         //                   use of [T]
   5872         //
   5873         // Which is exactly what we already have. So we just bite the bullet
   5874         // here and don't form any address mode.
   5875         return nullptr;
   5876       }
   5877       // We formed [Base, Offset {, LSL Amnt}, #const]. Oops. Legalize it to
   5878       //
   5879       // [Base, Offset, {LSL amount}, #const] ->
   5880       //      add T, Base, #const
   5881       //      use of [T, Offset {, LSL amount}]
   5882       const Type PointerType = getPointerType();
   5883       Variable *T = makeReg(PointerType);
   5884       Context.insert<InstArithmetic>(Op, T, BaseVar,
   5885                                      Ctx->getConstantInt32(PositiveOffset));
   5886       BaseVar = T;
   5887       OffsetImm = 0;
   5888     }
   5889   }
   5890 
   5891   assert(BaseVar != nullptr);
   5892   assert(OffsetImm == 0 || OffsetReg == nullptr);
   5893   assert(OffsetReg == nullptr || CanHaveIndex);
   5894   assert(OffsetImm < 0 ? (ValidImmMask & -OffsetImm) == -OffsetImm
   5895                        : (ValidImmMask & OffsetImm) == OffsetImm);
   5896 
   5897   if (OffsetReg != nullptr) {
   5898     Variable *OffsetR = makeReg(getPointerType());
   5899     Context.insert<InstAssign>(OffsetR, OffsetReg);
   5900     return OperandARM32Mem::create(Func, Ty, BaseVar, OffsetR, ShiftKind,
   5901                                    OffsetRegShamt);
   5902   }
   5903 
   5904   return OperandARM32Mem::create(
   5905       Func, Ty, BaseVar,
   5906       llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetImm)));
   5907 }
   5908 
   5909 void TargetARM32::doAddressOptLoad() {
   5910   Inst *Instr = iteratorToInst(Context.getCur());
   5911   assert(llvm::isa<InstLoad>(Instr));
   5912   Variable *Dest = Instr->getDest();
   5913   Operand *Addr = Instr->getSrc(0);
   5914   if (OperandARM32Mem *Mem =
   5915           formAddressingMode(Dest->getType(), Func, Instr, Addr)) {
   5916     Instr->setDeleted();
   5917     Context.insert<InstLoad>(Dest, Mem);
   5918   }
   5919 }
   5920 
   5921 void TargetARM32::randomlyInsertNop(float Probability,
   5922                                     RandomNumberGenerator &RNG) {
   5923   RandomNumberGeneratorWrapper RNGW(RNG);
   5924   if (RNGW.getTrueWithProbability(Probability)) {
   5925     _nop();
   5926   }
   5927 }
   5928 
   5929 void TargetARM32::lowerPhi(const InstPhi * /*Instr*/) {
   5930   Func->setError("Phi found in regular instruction list");
   5931 }
   5932 
   5933 void TargetARM32::lowerRet(const InstRet *Instr) {
   5934   Variable *Reg = nullptr;
   5935   if (Instr->hasRetValue()) {
   5936     Operand *Src0 = Instr->getRetValue();
   5937     Type Ty = Src0->getType();
   5938     if (Ty == IceType_i64) {
   5939       Src0 = legalizeUndef(Src0);
   5940       Variable *R0 = legalizeToReg(loOperand(Src0), RegARM32::Reg_r0);
   5941       Variable *R1 = legalizeToReg(hiOperand(Src0), RegARM32::Reg_r1);
   5942       Reg = R0;
   5943       Context.insert<InstFakeUse>(R1);
   5944     } else if (Ty == IceType_f32) {
   5945       Variable *S0 = legalizeToReg(Src0, RegARM32::Reg_s0);
   5946       Reg = S0;
   5947     } else if (Ty == IceType_f64) {
   5948       Variable *D0 = legalizeToReg(Src0, RegARM32::Reg_d0);
   5949       Reg = D0;
   5950     } else if (isVectorType(Src0->getType())) {
   5951       Variable *Q0 = legalizeToReg(Src0, RegARM32::Reg_q0);
   5952       Reg = Q0;
   5953     } else {
   5954       Operand *Src0F = legalize(Src0, Legal_Reg | Legal_Flex);
   5955       Reg = makeReg(Src0F->getType(), RegARM32::Reg_r0);
   5956       _mov(Reg, Src0F, CondARM32::AL);
   5957     }
   5958   }
   5959   // Add a ret instruction even if sandboxing is enabled, because addEpilog
   5960   // explicitly looks for a ret instruction as a marker for where to insert the
   5961   // frame removal instructions. addEpilog is responsible for restoring the
   5962   // "lr" register as needed prior to this ret instruction.
   5963   _ret(getPhysicalRegister(RegARM32::Reg_lr), Reg);
   5964 
   5965   // Add a fake use of sp to make sure sp stays alive for the entire function.
   5966   // Otherwise post-call sp adjustments get dead-code eliminated.
   5967   // TODO: Are there more places where the fake use should be inserted? E.g.
   5968   // "void f(int n){while(1) g(n);}" may not have a ret instruction.
   5969   Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   5970   Context.insert<InstFakeUse>(SP);
   5971 }
   5972 
   5973 void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
   5974   auto *Dest = Instr->getDest();
   5975   const Type DestTy = Dest->getType();
   5976 
   5977   auto *T = makeReg(DestTy);
   5978   auto *Src0 = Instr->getSrc(0);
   5979   auto *Src1 = Instr->getSrc(1);
   5980   const SizeT NumElements = typeNumElements(DestTy);
   5981   const Type ElementType = typeElementType(DestTy);
   5982 
   5983   bool Replicate = true;
   5984   for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
   5985     if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
   5986       Replicate = false;
   5987     }
   5988   }
   5989 
   5990   if (Replicate) {
   5991     Variable *Src0Var = legalizeToReg(Src0);
   5992     _vdup(T, Src0Var, Instr->getIndexValue(0));
   5993     _mov(Dest, T);
   5994     return;
   5995   }
   5996 
   5997   switch (DestTy) {
   5998   case IceType_v8i1:
   5999   case IceType_v8i16: {
   6000     static constexpr SizeT ExpectedNumElements = 8;
   6001     assert(ExpectedNumElements == Instr->getNumIndexes());
   6002     (void)ExpectedNumElements;
   6003 
   6004     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
   6005       Variable *Src0R = legalizeToReg(Src0);
   6006       _vzip(T, Src0R, Src0R);
   6007       _mov(Dest, T);
   6008       return;
   6009     }
   6010 
   6011     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
   6012       Variable *Src0R = legalizeToReg(Src0);
   6013       Variable *Src1R = legalizeToReg(Src1);
   6014       _vzip(T, Src0R, Src1R);
   6015       _mov(Dest, T);
   6016       return;
   6017     }
   6018 
   6019     if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
   6020       Variable *Src0R = legalizeToReg(Src0);
   6021       _vqmovn2(T, Src0R, Src0R, false, false);
   6022       _mov(Dest, T);
   6023       return;
   6024     }
   6025   } break;
   6026   case IceType_v16i1:
   6027   case IceType_v16i8: {
   6028     static constexpr SizeT ExpectedNumElements = 16;
   6029     assert(ExpectedNumElements == Instr->getNumIndexes());
   6030     (void)ExpectedNumElements;
   6031 
   6032     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
   6033       Variable *Src0R = legalizeToReg(Src0);
   6034       _vzip(T, Src0R, Src0R);
   6035       _mov(Dest, T);
   6036       return;
   6037     }
   6038 
   6039     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
   6040                           23)) {
   6041       Variable *Src0R = legalizeToReg(Src0);
   6042       Variable *Src1R = legalizeToReg(Src1);
   6043       _vzip(T, Src0R, Src1R);
   6044       _mov(Dest, T);
   6045       return;
   6046     }
   6047   } break;
   6048   case IceType_v4i1:
   6049   case IceType_v4i32:
   6050   case IceType_v4f32: {
   6051     static constexpr SizeT ExpectedNumElements = 4;
   6052     assert(ExpectedNumElements == Instr->getNumIndexes());
   6053     (void)ExpectedNumElements;
   6054 
   6055     if (Instr->indexesAre(0, 0, 1, 1)) {
   6056       Variable *Src0R = legalizeToReg(Src0);
   6057       _vzip(T, Src0R, Src0R);
   6058       _mov(Dest, T);
   6059       return;
   6060     }
   6061 
   6062     if (Instr->indexesAre(0, 4, 1, 5)) {
   6063       Variable *Src0R = legalizeToReg(Src0);
   6064       Variable *Src1R = legalizeToReg(Src1);
   6065       _vzip(T, Src0R, Src1R);
   6066       _mov(Dest, T);
   6067       return;
   6068     }
   6069 
   6070     if (Instr->indexesAre(0, 1, 4, 5)) {
   6071       Variable *Src0R = legalizeToReg(Src0);
   6072       Variable *Src1R = legalizeToReg(Src1);
   6073       _vmovlh(T, Src0R, Src1R);
   6074       _mov(Dest, T);
   6075       return;
   6076     }
   6077 
   6078     if (Instr->indexesAre(2, 3, 2, 3)) {
   6079       Variable *Src0R = legalizeToReg(Src0);
   6080       _vmovhl(T, Src0R, Src0R);
   6081       _mov(Dest, T);
   6082       return;
   6083     }
   6084 
   6085     if (Instr->indexesAre(2, 3, 6, 7)) {
   6086       Variable *Src0R = legalizeToReg(Src0);
   6087       Variable *Src1R = legalizeToReg(Src1);
   6088       _vmovhl(T, Src1R, Src0R);
   6089       _mov(Dest, T);
   6090       return;
   6091     }
   6092   } break;
   6093   default:
   6094     break;
   6095     // TODO(jpp): figure out how to properly lower this without scalarization.
   6096   }
   6097 
   6098   // Unoptimized shuffle. Perform a series of inserts and extracts.
   6099   Context.insert<InstFakeDef>(T);
   6100   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
   6101     auto *Index = Instr->getIndex(I);
   6102     const SizeT Elem = Index->getValue();
   6103     auto *ExtElmt = makeReg(ElementType);
   6104     if (Elem < NumElements) {
   6105       lowerExtractElement(
   6106           InstExtractElement::create(Func, ExtElmt, Src0, Index));
   6107     } else {
   6108       lowerExtractElement(InstExtractElement::create(
   6109           Func, ExtElmt, Src1,
   6110           Ctx->getConstantInt32(Index->getValue() - NumElements)));
   6111     }
   6112     auto *NewT = makeReg(DestTy);
   6113     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
   6114                                                  Ctx->getConstantInt32(I)));
   6115     T = NewT;
   6116   }
   6117   _mov(Dest, T);
   6118 }
   6119 
   6120 void TargetARM32::lowerSelect(const InstSelect *Instr) {
   6121   Variable *Dest = Instr->getDest();
   6122   Type DestTy = Dest->getType();
   6123   Operand *SrcT = Instr->getTrueOperand();
   6124   Operand *SrcF = Instr->getFalseOperand();
   6125   Operand *Condition = Instr->getCondition();
   6126 
   6127   if (!isVectorType(DestTy)) {
   6128     lowerInt1ForSelect(Dest, Condition, legalizeUndef(SrcT),
   6129                        legalizeUndef(SrcF));
   6130     return;
   6131   }
   6132 
   6133   Type TType = DestTy;
   6134   switch (DestTy) {
   6135   default:
   6136     llvm::report_fatal_error("Unexpected type for vector select.");
   6137   case IceType_v4i1:
   6138     TType = IceType_v4i32;
   6139     break;
   6140   case IceType_v8i1:
   6141     TType = IceType_v8i16;
   6142     break;
   6143   case IceType_v16i1:
   6144     TType = IceType_v16i8;
   6145     break;
   6146   case IceType_v4f32:
   6147     TType = IceType_v4i32;
   6148     break;
   6149   case IceType_v4i32:
   6150   case IceType_v8i16:
   6151   case IceType_v16i8:
   6152     break;
   6153   }
   6154   auto *T = makeReg(TType);
   6155   lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
   6156   auto *SrcTR = legalizeToReg(SrcT);
   6157   auto *SrcFR = legalizeToReg(SrcF);
   6158   _vbsl(T, SrcTR, SrcFR)->setDestRedefined();
   6159   _mov(Dest, T);
   6160 }
   6161 
   6162 void TargetARM32::lowerStore(const InstStore *Instr) {
   6163   Operand *Value = Instr->getData();
   6164   Operand *Addr = Instr->getAddr();
   6165   OperandARM32Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
   6166   Type Ty = NewAddr->getType();
   6167 
   6168   if (Ty == IceType_i64) {
   6169     Value = legalizeUndef(Value);
   6170     Variable *ValueHi = legalizeToReg(hiOperand(Value));
   6171     Variable *ValueLo = legalizeToReg(loOperand(Value));
   6172     _str(ValueHi, llvm::cast<OperandARM32Mem>(hiOperand(NewAddr)));
   6173     _str(ValueLo, llvm::cast<OperandARM32Mem>(loOperand(NewAddr)));
   6174   } else {
   6175     Variable *ValueR = legalizeToReg(Value);
   6176     _str(ValueR, NewAddr);
   6177   }
   6178 }
   6179 
   6180 void TargetARM32::doAddressOptStore() {
   6181   Inst *Instr = iteratorToInst(Context.getCur());
   6182   assert(llvm::isa<InstStore>(Instr));
   6183   Operand *Src = Instr->getSrc(0);
   6184   Operand *Addr = Instr->getSrc(1);
   6185   if (OperandARM32Mem *Mem =
   6186           formAddressingMode(Src->getType(), Func, Instr, Addr)) {
   6187     Instr->setDeleted();
   6188     Context.insert<InstStore>(Src, Mem);
   6189   }
   6190 }
   6191 
   6192 void TargetARM32::lowerSwitch(const InstSwitch *Instr) {
   6193   // This implements the most naive possible lowering.
   6194   // cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
   6195   Operand *Src0 = Instr->getComparison();
   6196   SizeT NumCases = Instr->getNumCases();
   6197   if (Src0->getType() == IceType_i64) {
   6198     Src0 = legalizeUndef(Src0);
   6199     Variable *Src0Lo = legalizeToReg(loOperand(Src0));
   6200     Variable *Src0Hi = legalizeToReg(hiOperand(Src0));
   6201     for (SizeT I = 0; I < NumCases; ++I) {
   6202       Operand *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
   6203       Operand *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
   6204       ValueLo = legalize(ValueLo, Legal_Reg | Legal_Flex);
   6205       ValueHi = legalize(ValueHi, Legal_Reg | Legal_Flex);
   6206       _cmp(Src0Lo, ValueLo);
   6207       _cmp(Src0Hi, ValueHi, CondARM32::EQ);
   6208       _br(Instr->getLabel(I), CondARM32::EQ);
   6209     }
   6210     _br(Instr->getLabelDefault());
   6211     return;
   6212   }
   6213 
   6214   Variable *Src0Var = legalizeToReg(Src0);
   6215   // If Src0 is not an i32, we left shift it -- see the icmp lowering for the
   6216   // reason.
   6217   assert(Src0Var->mustHaveReg());
   6218   const size_t ShiftAmt = 32 - getScalarIntBitWidth(Src0->getType());
   6219   assert(ShiftAmt < 32);
   6220   if (ShiftAmt > 0) {
   6221     Operand *ShAmtImm = shAmtImm(ShiftAmt);
   6222     Variable *T = makeReg(IceType_i32);
   6223     _lsl(T, Src0Var, ShAmtImm);
   6224     Src0Var = T;
   6225   }
   6226 
   6227   for (SizeT I = 0; I < NumCases; ++I) {
   6228     Operand *Value = Ctx->getConstantInt32(Instr->getValue(I) << ShiftAmt);
   6229     Value = legalize(Value, Legal_Reg | Legal_Flex);
   6230     _cmp(Src0Var, Value);
   6231     _br(Instr->getLabel(I), CondARM32::EQ);
   6232   }
   6233   _br(Instr->getLabelDefault());
   6234 }
   6235 
   6236 void TargetARM32::lowerBreakpoint(const InstBreakpoint *Instr) {
   6237   UnimplementedLoweringError(this, Instr);
   6238 }
   6239 
   6240 void TargetARM32::lowerUnreachable(const InstUnreachable * /*Instr*/) {
   6241   _trap();
   6242 }
   6243 
   6244 namespace {
   6245 // Returns whether Opnd needs the GOT address. Currently, ConstantRelocatables,
   6246 // and fp constants will need access to the GOT address.
   6247 bool operandNeedsGot(const Operand *Opnd) {
   6248   if (llvm::isa<ConstantRelocatable>(Opnd)) {
   6249     return true;
   6250   }
   6251 
   6252   if (llvm::isa<ConstantFloat>(Opnd)) {
   6253     uint32_t _;
   6254     return !OperandARM32FlexFpImm::canHoldImm(Opnd, &_);
   6255   }
   6256 
   6257   const auto *F64 = llvm::dyn_cast<ConstantDouble>(Opnd);
   6258   if (F64 != nullptr) {
   6259     uint32_t _;
   6260     return !OperandARM32FlexFpImm::canHoldImm(Opnd, &_) &&
   6261            !isFloatingPointZero(F64);
   6262   }
   6263 
   6264   return false;
   6265 }
   6266 
   6267 // Returns whether Phi needs the GOT address (which it does if any of its
   6268 // operands needs the GOT address.)
   6269 bool phiNeedsGot(const InstPhi *Phi) {
   6270   if (Phi->isDeleted()) {
   6271     return false;
   6272   }
   6273 
   6274   for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
   6275     if (operandNeedsGot(Phi->getSrc(I))) {
   6276       return true;
   6277     }
   6278   }
   6279 
   6280   return false;
   6281 }
   6282 
   6283 // Returns whether **any** phi in Node needs the GOT address.
   6284 bool anyPhiInNodeNeedsGot(CfgNode *Node) {
   6285   for (auto &Inst : Node->getPhis()) {
   6286     if (phiNeedsGot(llvm::cast<InstPhi>(&Inst))) {
   6287       return true;
   6288     }
   6289   }
   6290   return false;
   6291 }
   6292 
   6293 } // end of anonymous namespace
   6294 
   6295 void TargetARM32::prelowerPhis() {
   6296   CfgNode *Node = Context.getNode();
   6297 
   6298   if (SandboxingType == ST_Nonsfi) {
   6299     assert(GotPtr != nullptr);
   6300     if (anyPhiInNodeNeedsGot(Node)) {
   6301       // If any phi instruction needs the GOT address, we place a
   6302       //   fake-use GotPtr
   6303       // in Node to prevent the GotPtr's initialization from being dead code
   6304       // eliminated.
   6305       Node->getInsts().push_front(InstFakeUse::create(Func, GotPtr));
   6306     }
   6307   }
   6308 
   6309   PhiLowering::prelowerPhis32Bit(this, Node, Func);
   6310 }
   6311 
   6312 Variable *TargetARM32::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
   6313   Variable *Reg = makeReg(Ty, RegNum);
   6314   Context.insert<InstFakeDef>(Reg);
   6315   assert(isVectorType(Ty));
   6316   _veor(Reg, Reg, Reg);
   6317   return Reg;
   6318 }
   6319 
   6320 // Helper for legalize() to emit the right code to lower an operand to a
   6321 // register of the appropriate type.
   6322 Variable *TargetARM32::copyToReg(Operand *Src, RegNumT RegNum) {
   6323   Type Ty = Src->getType();
   6324   Variable *Reg = makeReg(Ty, RegNum);
   6325   if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Src)) {
   6326     _ldr(Reg, Mem);
   6327   } else {
   6328     _mov(Reg, Src);
   6329   }
   6330   return Reg;
   6331 }
   6332 
   6333 // TODO(jpp): remove unneeded else clauses in legalize.
   6334 Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
   6335                                RegNumT RegNum) {
   6336   Type Ty = From->getType();
   6337   // Assert that a physical register is allowed. To date, all calls to
   6338   // legalize() allow a physical register. Legal_Flex converts registers to the
   6339   // right type OperandARM32FlexReg as needed.
   6340   assert(Allowed & Legal_Reg);
   6341 
   6342   // Copied ipsis literis from TargetX86Base<Machine>.
   6343   if (RegNum.hasNoValue()) {
   6344     if (Variable *Subst = getContext().availabilityGet(From)) {
   6345       // At this point we know there is a potential substitution available.
   6346       if (!Subst->isRematerializable() && Subst->mustHaveReg() &&
   6347           !Subst->hasReg()) {
   6348         // At this point we know the substitution will have a register.
   6349         if (From->getType() == Subst->getType()) {
   6350           // At this point we know the substitution's register is compatible.
   6351           return Subst;
   6352         }
   6353       }
   6354     }
   6355   }
   6356 
   6357   // Go through the various types of operands: OperandARM32Mem,
   6358   // OperandARM32Flex, Constant, and Variable. Given the above assertion, if
   6359   // type of operand is not legal (e.g., OperandARM32Mem and !Legal_Mem), we
   6360   // can always copy to a register.
   6361   if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(From)) {
   6362     // Before doing anything with a Mem operand, we need to ensure that the
   6363     // Base and Index components are in physical registers.
   6364     Variable *Base = Mem->getBase();
   6365     Variable *Index = Mem->getIndex();
   6366     ConstantInteger32 *Offset = Mem->getOffset();
   6367     assert(Index == nullptr || Offset == nullptr);
   6368     Variable *RegBase = nullptr;
   6369     Variable *RegIndex = nullptr;
   6370     assert(Base);
   6371     RegBase = llvm::cast<Variable>(
   6372         legalize(Base, Legal_Reg | Legal_Rematerializable));
   6373     assert(Ty < MemTraitsSize);
   6374     if (Index) {
   6375       assert(Offset == nullptr);
   6376       assert(MemTraits[Ty].CanHaveIndex);
   6377       RegIndex = legalizeToReg(Index);
   6378     }
   6379     if (Offset && Offset->getValue() != 0) {
   6380       assert(Index == nullptr);
   6381       static constexpr bool ZeroExt = false;
   6382       assert(MemTraits[Ty].CanHaveImm);
   6383       if (!OperandARM32Mem::canHoldOffset(Ty, ZeroExt, Offset->getValue())) {
   6384         llvm::report_fatal_error("Invalid memory offset.");
   6385       }
   6386     }
   6387 
   6388     // Create a new operand if there was a change.
   6389     if (Base != RegBase || Index != RegIndex) {
   6390       // There is only a reg +/- reg or reg + imm form.
   6391       // Figure out which to re-create.
   6392       if (RegIndex) {
   6393         Mem = OperandARM32Mem::create(Func, Ty, RegBase, RegIndex,
   6394                                       Mem->getShiftOp(), Mem->getShiftAmt(),
   6395                                       Mem->getAddrMode());
   6396       } else {
   6397         Mem = OperandARM32Mem::create(Func, Ty, RegBase, Offset,
   6398                                       Mem->getAddrMode());
   6399       }
   6400     }
   6401     if (Allowed & Legal_Mem) {
   6402       From = Mem;
   6403     } else {
   6404       Variable *Reg = makeReg(Ty, RegNum);
   6405       _ldr(Reg, Mem);
   6406       From = Reg;
   6407     }
   6408     return From;
   6409   }
   6410 
   6411   if (auto *Flex = llvm::dyn_cast<OperandARM32Flex>(From)) {
   6412     if (!(Allowed & Legal_Flex)) {
   6413       if (auto *FlexReg = llvm::dyn_cast<OperandARM32FlexReg>(Flex)) {
   6414         if (FlexReg->getShiftOp() == OperandARM32::kNoShift) {
   6415           From = FlexReg->getReg();
   6416           // Fall through and let From be checked as a Variable below, where it
   6417           // may or may not need a register.
   6418         } else {
   6419           return copyToReg(Flex, RegNum);
   6420         }
   6421       } else {
   6422         return copyToReg(Flex, RegNum);
   6423       }
   6424     } else {
   6425       return From;
   6426     }
   6427   }
   6428 
   6429   if (llvm::isa<Constant>(From)) {
   6430     if (llvm::isa<ConstantUndef>(From)) {
   6431       From = legalizeUndef(From, RegNum);
   6432       if (isVectorType(Ty))
   6433         return From;
   6434     }
   6435     // There should be no constants of vector type (other than undef).
   6436     assert(!isVectorType(Ty));
   6437     if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(From)) {
   6438       uint32_t RotateAmt;
   6439       uint32_t Immed_8;
   6440       uint32_t Value = static_cast<uint32_t>(C32->getValue());
   6441       if (OperandARM32FlexImm::canHoldImm(Value, &RotateAmt, &Immed_8)) {
   6442         // The immediate can be encoded as a Flex immediate. We may return the
   6443         // Flex operand if the caller has Allow'ed it.
   6444         auto *OpF = OperandARM32FlexImm::create(Func, Ty, Immed_8, RotateAmt);
   6445         const bool CanBeFlex = Allowed & Legal_Flex;
   6446         if (CanBeFlex)
   6447           return OpF;
   6448         return copyToReg(OpF, RegNum);
   6449       } else if (OperandARM32FlexImm::canHoldImm(~Value, &RotateAmt,
   6450                                                  &Immed_8)) {
   6451         // Even though the immediate can't be encoded as a Flex operand, its
   6452         // inverted bit pattern can, thus we use ARM's mvn to load the 32-bit
   6453         // constant with a single instruction.
   6454         auto *InvOpF =
   6455             OperandARM32FlexImm::create(Func, Ty, Immed_8, RotateAmt);
   6456         Variable *Reg = makeReg(Ty, RegNum);
   6457         _mvn(Reg, InvOpF);
   6458         return Reg;
   6459       } else {
   6460         // Do a movw/movt to a register.
   6461         Variable *Reg = makeReg(Ty, RegNum);
   6462         uint32_t UpperBits = (Value >> 16) & 0xFFFF;
   6463         _movw(Reg,
   6464               UpperBits != 0 ? Ctx->getConstantInt32(Value & 0xFFFF) : C32);
   6465         if (UpperBits != 0) {
   6466           _movt(Reg, Ctx->getConstantInt32(UpperBits));
   6467         }
   6468         return Reg;
   6469       }
   6470     } else if (auto *C = llvm::dyn_cast<ConstantRelocatable>(From)) {
   6471       Variable *Reg = makeReg(Ty, RegNum);
   6472       if (SandboxingType != ST_Nonsfi) {
   6473         _movw(Reg, C);
   6474         _movt(Reg, C);
   6475       } else {
   6476         auto *GotAddr = legalizeToReg(GotPtr);
   6477         GlobalString CGotoffName = createGotoffRelocation(C);
   6478         loadNamedConstantRelocatablePIC(
   6479             CGotoffName, Reg, [this, Reg](Variable *PC) {
   6480               _ldr(Reg, OperandARM32Mem::create(Func, IceType_i32, PC, Reg));
   6481             });
   6482         _add(Reg, GotAddr, Reg);
   6483       }
   6484       return Reg;
   6485     } else {
   6486       assert(isScalarFloatingType(Ty));
   6487       uint32_t ModifiedImm;
   6488       if (OperandARM32FlexFpImm::canHoldImm(From, &ModifiedImm)) {
   6489         Variable *T = makeReg(Ty, RegNum);
   6490         _mov(T,
   6491              OperandARM32FlexFpImm::create(Func, From->getType(), ModifiedImm));
   6492         return T;
   6493       }
   6494 
   6495       if (Ty == IceType_f64 && isFloatingPointZero(From)) {
   6496         // Use T = T ^ T to load a 64-bit fp zero. This does not work for f32
   6497         // because ARM does not have a veor instruction with S registers.
   6498         Variable *T = makeReg(IceType_f64, RegNum);
   6499         Context.insert<InstFakeDef>(T);
   6500         _veor(T, T, T);
   6501         return T;
   6502       }
   6503 
   6504       // Load floats/doubles from literal pool.
   6505       auto *CFrom = llvm::cast<Constant>(From);
   6506       assert(CFrom->getShouldBePooled());
   6507       Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
   6508       Variable *BaseReg = nullptr;
   6509       if (SandboxingType == ST_Nonsfi) {
   6510         // vldr does not support the [base, index] addressing mode, so we need
   6511         // to legalize Offset to a register. Otherwise, we could simply
   6512         //   vldr dest, [got, reg(Offset)]
   6513         BaseReg = legalizeToReg(Offset);
   6514       } else {
   6515         BaseReg = makeReg(getPointerType());
   6516         _movw(BaseReg, Offset);
   6517         _movt(BaseReg, Offset);
   6518       }
   6519       From = formMemoryOperand(BaseReg, Ty);
   6520       return copyToReg(From, RegNum);
   6521     }
   6522   }
   6523 
   6524   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
   6525     if (Var->isRematerializable()) {
   6526       if (Allowed & Legal_Rematerializable) {
   6527         return From;
   6528       }
   6529 
   6530       Variable *T = makeReg(Var->getType(), RegNum);
   6531       _mov(T, Var);
   6532       return T;
   6533     }
   6534     // Check if the variable is guaranteed a physical register. This can happen
   6535     // either when the variable is pre-colored or when it is assigned infinite
   6536     // weight.
   6537     bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
   6538     // We need a new physical register for the operand if:
   6539     //   Mem is not allowed and Var isn't guaranteed a physical
   6540     //   register, or
   6541     //   RegNum is required and Var->getRegNum() doesn't match.
   6542     if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
   6543         (RegNum.hasValue() && (RegNum != Var->getRegNum()))) {
   6544       From = copyToReg(From, RegNum);
   6545     }
   6546     return From;
   6547   }
   6548   llvm::report_fatal_error("Unhandled operand kind in legalize()");
   6549 
   6550   return From;
   6551 }
   6552 
   6553 /// Provide a trivial wrapper to legalize() for this common usage.
   6554 Variable *TargetARM32::legalizeToReg(Operand *From, RegNumT RegNum) {
   6555   return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
   6556 }
   6557 
   6558 /// Legalize undef values to concrete values.
   6559 Operand *TargetARM32::legalizeUndef(Operand *From, RegNumT RegNum) {
   6560   Type Ty = From->getType();
   6561   if (llvm::isa<ConstantUndef>(From)) {
   6562     // Lower undefs to zero. Another option is to lower undefs to an
   6563     // uninitialized register; however, using an uninitialized register results
   6564     // in less predictable code.
   6565     //
   6566     // If in the future the implementation is changed to lower undef values to
   6567     // uninitialized registers, a FakeDef will be needed:
   6568     // Context.insert(InstFakeDef::create(Func, Reg)); This is in order to
   6569     // ensure that the live range of Reg is not overestimated. If the constant
   6570     // being lowered is a 64 bit value, then the result should be split and the
   6571     // lo and hi components will need to go in uninitialized registers.
   6572     if (isVectorType(Ty))
   6573       return makeVectorOfZeros(Ty, RegNum);
   6574     return Ctx->getConstantZero(Ty);
   6575   }
   6576   return From;
   6577 }
   6578 
   6579 OperandARM32Mem *TargetARM32::formMemoryOperand(Operand *Operand, Type Ty) {
   6580   auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand);
   6581   // It may be the case that address mode optimization already creates an
   6582   // OperandARM32Mem, so in that case it wouldn't need another level of
   6583   // transformation.
   6584   if (Mem) {
   6585     return llvm::cast<OperandARM32Mem>(legalize(Mem));
   6586   }
   6587   // If we didn't do address mode optimization, then we only have a
   6588   // base/offset to work with. ARM always requires a base register, so
   6589   // just use that to hold the operand.
   6590   auto *Base = llvm::cast<Variable>(
   6591       legalize(Operand, Legal_Reg | Legal_Rematerializable));
   6592   return OperandARM32Mem::create(
   6593       Func, Ty, Base,
   6594       llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32)));
   6595 }
   6596 
   6597 Variable64On32 *TargetARM32::makeI64RegPair() {
   6598   Variable64On32 *Reg =
   6599       llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
   6600   Reg->setMustHaveReg();
   6601   Reg->initHiLo(Func);
   6602   Reg->getLo()->setMustNotHaveReg();
   6603   Reg->getHi()->setMustNotHaveReg();
   6604   return Reg;
   6605 }
   6606 
   6607 Variable *TargetARM32::makeReg(Type Type, RegNumT RegNum) {
   6608   // There aren't any 64-bit integer registers for ARM32.
   6609   assert(Type != IceType_i64);
   6610   assert(AllowTemporaryWithNoReg || RegNum.hasValue());
   6611   Variable *Reg = Func->makeVariable(Type);
   6612   if (RegNum.hasValue())
   6613     Reg->setRegNum(RegNum);
   6614   else
   6615     Reg->setMustHaveReg();
   6616   return Reg;
   6617 }
   6618 
   6619 void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align,
   6620                                     RegNumT TmpRegNum) {
   6621   assert(llvm::isPowerOf2_32(Align));
   6622   uint32_t RotateAmt;
   6623   uint32_t Immed_8;
   6624   Operand *Mask;
   6625   // Use AND or BIC to mask off the bits, depending on which immediate fits (if
   6626   // it fits at all). Assume Align is usually small, in which case BIC works
   6627   // better. Thus, this rounds down to the alignment.
   6628   if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
   6629     Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex,
   6630                     TmpRegNum);
   6631     _bic(Reg, Reg, Mask);
   6632   } else {
   6633     Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex,
   6634                     TmpRegNum);
   6635     _and(Reg, Reg, Mask);
   6636   }
   6637 }
   6638 
   6639 void TargetARM32::postLower() {
   6640   if (Func->getOptLevel() == Opt_m1)
   6641     return;
   6642   markRedefinitions();
   6643   Context.availabilityUpdate();
   6644 }
   6645 
   6646 void TargetARM32::makeRandomRegisterPermutation(
   6647     llvm::SmallVectorImpl<RegNumT> &Permutation,
   6648     const SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
   6649   (void)Permutation;
   6650   (void)ExcludeRegisters;
   6651   (void)Salt;
   6652   UnimplementedError(getFlags());
   6653 }
   6654 
   6655 void TargetARM32::emit(const ConstantInteger32 *C) const {
   6656   if (!BuildDefs::dump())
   6657     return;
   6658   Ostream &Str = Ctx->getStrEmit();
   6659   Str << "#" << C->getValue();
   6660 }
   6661 
   6662 void TargetARM32::emit(const ConstantInteger64 *) const {
   6663   llvm::report_fatal_error("Not expecting to emit 64-bit integers");
   6664 }
   6665 
   6666 void TargetARM32::emit(const ConstantFloat *C) const {
   6667   (void)C;
   6668   UnimplementedError(getFlags());
   6669 }
   6670 
   6671 void TargetARM32::emit(const ConstantDouble *C) const {
   6672   (void)C;
   6673   UnimplementedError(getFlags());
   6674 }
   6675 
   6676 void TargetARM32::emit(const ConstantUndef *) const {
   6677   llvm::report_fatal_error("undef value encountered by emitter.");
   6678 }
   6679 
   6680 void TargetARM32::emit(const ConstantRelocatable *C) const {
   6681   if (!BuildDefs::dump())
   6682     return;
   6683   Ostream &Str = Ctx->getStrEmit();
   6684   Str << "#";
   6685   emitWithoutPrefix(C);
   6686 }
   6687 
   6688 void TargetARM32::lowerInt1ForSelect(Variable *Dest, Operand *Boolean,
   6689                                      Operand *TrueValue, Operand *FalseValue) {
   6690   Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
   6691 
   6692   assert(Boolean->getType() == IceType_i1);
   6693 
   6694   bool NeedsAnd1 = false;
   6695   if (TrueValue->getType() == IceType_i1) {
   6696     assert(FalseValue->getType() == IceType_i1);
   6697 
   6698     Variable *TrueValueV = Func->makeVariable(IceType_i1);
   6699     SafeBoolChain Src0Safe = lowerInt1(TrueValueV, TrueValue);
   6700     TrueValue = TrueValueV;
   6701 
   6702     Variable *FalseValueV = Func->makeVariable(IceType_i1);
   6703     SafeBoolChain Src1Safe = lowerInt1(FalseValueV, FalseValue);
   6704     FalseValue = FalseValueV;
   6705 
   6706     NeedsAnd1 = Src0Safe == SBC_No || Src1Safe == SBC_No;
   6707   }
   6708 
   6709   Variable *DestLo = (Dest->getType() == IceType_i64)
   6710                          ? llvm::cast<Variable>(loOperand(Dest))
   6711                          : Dest;
   6712   Variable *DestHi = (Dest->getType() == IceType_i64)
   6713                          ? llvm::cast<Variable>(hiOperand(Dest))
   6714                          : nullptr;
   6715   Operand *FalseValueLo = (FalseValue->getType() == IceType_i64)
   6716                               ? loOperand(FalseValue)
   6717                               : FalseValue;
   6718   Operand *FalseValueHi =
   6719       (FalseValue->getType() == IceType_i64) ? hiOperand(FalseValue) : nullptr;
   6720 
   6721   Operand *TrueValueLo =
   6722       (TrueValue->getType() == IceType_i64) ? loOperand(TrueValue) : TrueValue;
   6723   Operand *TrueValueHi =
   6724       (TrueValue->getType() == IceType_i64) ? hiOperand(TrueValue) : nullptr;
   6725 
   6726   Variable *T_Lo = makeReg(DestLo->getType());
   6727   Variable *T_Hi = (DestHi == nullptr) ? nullptr : makeReg(DestHi->getType());
   6728 
   6729   _mov(T_Lo, legalize(FalseValueLo, Legal_Reg | Legal_Flex));
   6730   if (DestHi) {
   6731     _mov(T_Hi, legalize(FalseValueHi, Legal_Reg | Legal_Flex));
   6732   }
   6733 
   6734   CondWhenTrue Cond(CondARM32::kNone);
   6735   // FlagsWereSet is used to determine wether Boolean was folded or not. If not,
   6736   // add an explicit _tst instruction below.
   6737   bool FlagsWereSet = false;
   6738   if (const Inst *Producer = Computations.getProducerOf(Boolean)) {
   6739     switch (Producer->getKind()) {
   6740     default:
   6741       llvm::report_fatal_error("Unexpected producer.");
   6742     case Inst::Icmp: {
   6743       Cond = lowerIcmpCond(llvm::cast<InstIcmp>(Producer));
   6744       FlagsWereSet = true;
   6745     } break;
   6746     case Inst::Fcmp: {
   6747       Cond = lowerFcmpCond(llvm::cast<InstFcmp>(Producer));
   6748       FlagsWereSet = true;
   6749     } break;
   6750     case Inst::Cast: {
   6751       const auto *CastProducer = llvm::cast<InstCast>(Producer);
   6752       assert(CastProducer->getCastKind() == InstCast::Trunc);
   6753       Boolean = CastProducer->getSrc(0);
   6754       // No flags were set, so a _tst(Src, 1) will be emitted below. Don't
   6755       // bother legalizing Src to a Reg because it will be legalized before
   6756       // emitting the tst instruction.
   6757       FlagsWereSet = false;
   6758     } break;
   6759     case Inst::Arithmetic: {
   6760       // This is a special case: we eagerly assumed Producer could be folded,
   6761       // but in reality, it can't. No reason to panic: we just lower it using
   6762       // the regular lowerArithmetic helper.
   6763       const auto *ArithProducer = llvm::cast<InstArithmetic>(Producer);
   6764       lowerArithmetic(ArithProducer);
   6765       Boolean = ArithProducer->getDest();
   6766       // No flags were set, so a _tst(Dest, 1) will be emitted below. Don't
   6767       // bother legalizing Dest to a Reg because it will be legalized before
   6768       // emitting  the tst instruction.
   6769       FlagsWereSet = false;
   6770     } break;
   6771     }
   6772   }
   6773 
   6774   if (!FlagsWereSet) {
   6775     // No flags have been set, so emit a tst Boolean, 1.
   6776     Variable *Src = legalizeToReg(Boolean);
   6777     _tst(Src, _1);
   6778     Cond = CondWhenTrue(CondARM32::NE); // i.e., CondARM32::NotZero.
   6779   }
   6780 
   6781   if (Cond.WhenTrue0 == CondARM32::kNone) {
   6782     assert(Cond.WhenTrue1 == CondARM32::kNone);
   6783   } else {
   6784     _mov_redefined(T_Lo, legalize(TrueValueLo, Legal_Reg | Legal_Flex),
   6785                    Cond.WhenTrue0);
   6786     if (DestHi) {
   6787       _mov_redefined(T_Hi, legalize(TrueValueHi, Legal_Reg | Legal_Flex),
   6788                      Cond.WhenTrue0);
   6789     }
   6790   }
   6791 
   6792   if (Cond.WhenTrue1 != CondARM32::kNone) {
   6793     _mov_redefined(T_Lo, legalize(TrueValueLo, Legal_Reg | Legal_Flex),
   6794                    Cond.WhenTrue1);
   6795     if (DestHi) {
   6796       _mov_redefined(T_Hi, legalize(TrueValueHi, Legal_Reg | Legal_Flex),
   6797                      Cond.WhenTrue1);
   6798     }
   6799   }
   6800 
   6801   if (NeedsAnd1) {
   6802     // We lowered something that is unsafe (i.e., can't provably be zero or
   6803     // one). Truncate the result.
   6804     _and(T_Lo, T_Lo, _1);
   6805   }
   6806 
   6807   _mov(DestLo, T_Lo);
   6808   if (DestHi) {
   6809     _mov(DestHi, T_Hi);
   6810   }
   6811 }
   6812 
   6813 TargetARM32::SafeBoolChain TargetARM32::lowerInt1(Variable *Dest,
   6814                                                   Operand *Boolean) {
   6815   assert(Boolean->getType() == IceType_i1);
   6816   Variable *T = makeReg(IceType_i1);
   6817   Operand *_0 =
   6818       legalize(Ctx->getConstantZero(IceType_i1), Legal_Reg | Legal_Flex);
   6819   Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
   6820 
   6821   SafeBoolChain Safe = SBC_Yes;
   6822   if (const Inst *Producer = Computations.getProducerOf(Boolean)) {
   6823     switch (Producer->getKind()) {
   6824     default:
   6825       llvm::report_fatal_error("Unexpected producer.");
   6826     case Inst::Icmp: {
   6827       _mov(T, _0);
   6828       CondWhenTrue Cond = lowerIcmpCond(llvm::cast<InstIcmp>(Producer));
   6829       assert(Cond.WhenTrue0 != CondARM32::AL);
   6830       assert(Cond.WhenTrue0 != CondARM32::kNone);
   6831       assert(Cond.WhenTrue1 == CondARM32::kNone);
   6832       _mov_redefined(T, _1, Cond.WhenTrue0);
   6833     } break;
   6834     case Inst::Fcmp: {
   6835       _mov(T, _0);
   6836       Inst *MovZero = Context.getLastInserted();
   6837       CondWhenTrue Cond = lowerFcmpCond(llvm::cast<InstFcmp>(Producer));
   6838       if (Cond.WhenTrue0 == CondARM32::AL) {
   6839         assert(Cond.WhenTrue1 == CondARM32::kNone);
   6840         MovZero->setDeleted();
   6841         _mov(T, _1);
   6842       } else if (Cond.WhenTrue0 != CondARM32::kNone) {
   6843         _mov_redefined(T, _1, Cond.WhenTrue0);
   6844       }
   6845       if (Cond.WhenTrue1 != CondARM32::kNone) {
   6846         assert(Cond.WhenTrue0 != CondARM32::kNone);
   6847         assert(Cond.WhenTrue0 != CondARM32::AL);
   6848         _mov_redefined(T, _1, Cond.WhenTrue1);
   6849       }
   6850     } break;
   6851     case Inst::Cast: {
   6852       const auto *CastProducer = llvm::cast<InstCast>(Producer);
   6853       assert(CastProducer->getCastKind() == InstCast::Trunc);
   6854       Operand *Src = CastProducer->getSrc(0);
   6855       if (Src->getType() == IceType_i64)
   6856         Src = loOperand(Src);
   6857       _mov(T, legalize(Src, Legal_Reg | Legal_Flex));
   6858       Safe = SBC_No;
   6859     } break;
   6860     case Inst::Arithmetic: {
   6861       const auto *ArithProducer = llvm::cast<InstArithmetic>(Producer);
   6862       Safe = lowerInt1Arithmetic(ArithProducer);
   6863       _mov(T, ArithProducer->getDest());
   6864     } break;
   6865     }
   6866   } else {
   6867     _mov(T, legalize(Boolean, Legal_Reg | Legal_Flex));
   6868   }
   6869 
   6870   _mov(Dest, T);
   6871   return Safe;
   6872 }
   6873 
   6874 namespace {
   6875 namespace BoolFolding {
   6876 bool shouldTrackProducer(const Inst &Instr) {
   6877   switch (Instr.getKind()) {
   6878   default:
   6879     return false;
   6880   case Inst::Icmp:
   6881   case Inst::Fcmp:
   6882     return true;
   6883   case Inst::Cast: {
   6884     switch (llvm::cast<InstCast>(&Instr)->getCastKind()) {
   6885     default:
   6886       return false;
   6887     case InstCast::Trunc:
   6888       return true;
   6889     }
   6890   }
   6891   case Inst::Arithmetic: {
   6892     switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
   6893     default:
   6894       return false;
   6895     case InstArithmetic::And:
   6896     case InstArithmetic::Or:
   6897       return true;
   6898     }
   6899   }
   6900   }
   6901 }
   6902 
   6903 bool isValidConsumer(const Inst &Instr) {
   6904   switch (Instr.getKind()) {
   6905   default:
   6906     return false;
   6907   case Inst::Br:
   6908     return true;
   6909   case Inst::Select:
   6910     return !isVectorType(Instr.getDest()->getType());
   6911   case Inst::Cast: {
   6912     switch (llvm::cast<InstCast>(&Instr)->getCastKind()) {
   6913     default:
   6914       return false;
   6915     case InstCast::Sext:
   6916       return !isVectorType(Instr.getDest()->getType());
   6917     case InstCast::Zext:
   6918       return !isVectorType(Instr.getDest()->getType());
   6919     }
   6920   }
   6921   case Inst::Arithmetic: {
   6922     switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
   6923     default:
   6924       return false;
   6925     case InstArithmetic::And:
   6926       return !isVectorType(Instr.getDest()->getType());
   6927     case InstArithmetic::Or:
   6928       return !isVectorType(Instr.getDest()->getType());
   6929     }
   6930   }
   6931   }
   6932 }
   6933 } // end of namespace BoolFolding
   6934 
   6935 namespace FpFolding {
   6936 bool shouldTrackProducer(const Inst &Instr) {
   6937   switch (Instr.getKind()) {
   6938   default:
   6939     return false;
   6940   case Inst::Arithmetic: {
   6941     switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
   6942     default:
   6943       return false;
   6944     case InstArithmetic::Fmul:
   6945       return true;
   6946     }
   6947   }
   6948   }
   6949 }
   6950 
   6951 bool isValidConsumer(const Inst &Instr) {
   6952   switch (Instr.getKind()) {
   6953   default:
   6954     return false;
   6955   case Inst::Arithmetic: {
   6956     switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
   6957     default:
   6958       return false;
   6959     case InstArithmetic::Fadd:
   6960     case InstArithmetic::Fsub:
   6961       return true;
   6962     }
   6963   }
   6964   }
   6965 }
   6966 } // end of namespace FpFolding
   6967 
   6968 namespace IntFolding {
   6969 bool shouldTrackProducer(const Inst &Instr) {
   6970   switch (Instr.getKind()) {
   6971   default:
   6972     return false;
   6973   case Inst::Arithmetic: {
   6974     switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
   6975     default:
   6976       return false;
   6977     case InstArithmetic::Mul:
   6978       return true;
   6979     }
   6980   }
   6981   }
   6982 }
   6983 
   6984 bool isValidConsumer(const Inst &Instr) {
   6985   switch (Instr.getKind()) {
   6986   default:
   6987     return false;
   6988   case Inst::Arithmetic: {
   6989     switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
   6990     default:
   6991       return false;
   6992     case InstArithmetic::Add:
   6993     case InstArithmetic::Sub:
   6994       return true;
   6995     }
   6996   }
   6997   }
   6998 }
   6999 } // end of namespace FpFolding
   7000 } // end of anonymous namespace
   7001 
   7002 void TargetARM32::ComputationTracker::recordProducers(CfgNode *Node) {
   7003   for (Inst &Instr : Node->getInsts()) {
   7004     // Check whether Instr is a valid producer.
   7005     Variable *Dest = Instr.getDest();
   7006     if (!Instr.isDeleted() // only consider non-deleted instructions; and
   7007         && Dest            // only instructions with an actual dest var; and
   7008         && Dest->getType() == IceType_i1 // only bool-type dest vars; and
   7009         && BoolFolding::shouldTrackProducer(Instr)) { // white-listed instr.
   7010       KnownComputations.emplace(Dest->getIndex(),
   7011                                 ComputationEntry(&Instr, IceType_i1));
   7012     }
   7013     if (!Instr.isDeleted() // only consider non-deleted instructions; and
   7014         && Dest            // only instructions with an actual dest var; and
   7015         && isScalarFloatingType(Dest->getType()) // fp-type only dest vars; and
   7016         && FpFolding::shouldTrackProducer(Instr)) { // white-listed instr.
   7017       KnownComputations.emplace(Dest->getIndex(),
   7018                                 ComputationEntry(&Instr, Dest->getType()));
   7019     }
   7020     if (!Instr.isDeleted() // only consider non-deleted instructions; and
   7021         && Dest            // only instructions with an actual dest var; and
   7022         && Dest->getType() == IceType_i32            // i32 only dest vars; and
   7023         && IntFolding::shouldTrackProducer(Instr)) { // white-listed instr.
   7024       KnownComputations.emplace(Dest->getIndex(),
   7025                                 ComputationEntry(&Instr, IceType_i32));
   7026     }
   7027     // Check each src variable against the map.
   7028     FOREACH_VAR_IN_INST(Var, Instr) {
   7029       SizeT VarNum = Var->getIndex();
   7030       auto ComputationIter = KnownComputations.find(VarNum);
   7031       if (ComputationIter == KnownComputations.end()) {
   7032         continue;
   7033       }
   7034 
   7035       ++ComputationIter->second.NumUses;
   7036       switch (ComputationIter->second.ComputationType) {
   7037       default:
   7038         KnownComputations.erase(VarNum);
   7039         continue;
   7040       case IceType_i1:
   7041         if (!BoolFolding::isValidConsumer(Instr)) {
   7042           KnownComputations.erase(VarNum);
   7043           continue;
   7044         }
   7045         break;
   7046       case IceType_i32:
   7047         if (IndexOfVarInInst(Var) != 1 || !IntFolding::isValidConsumer(Instr)) {
   7048           KnownComputations.erase(VarNum);
   7049           continue;
   7050         }
   7051         break;
   7052       case IceType_f32:
   7053       case IceType_f64:
   7054         if (IndexOfVarInInst(Var) != 1 || !FpFolding::isValidConsumer(Instr)) {
   7055           KnownComputations.erase(VarNum);
   7056           continue;
   7057         }
   7058         break;
   7059       }
   7060 
   7061       if (Instr.isLastUse(Var)) {
   7062         ComputationIter->second.IsLiveOut = false;
   7063       }
   7064     }
   7065   }
   7066 
   7067   for (auto Iter = KnownComputations.begin(), End = KnownComputations.end();
   7068        Iter != End;) {
   7069     // Disable the folding if its dest may be live beyond this block.
   7070     if (Iter->second.IsLiveOut || Iter->second.NumUses > 1) {
   7071       Iter = KnownComputations.erase(Iter);
   7072       continue;
   7073     }
   7074 
   7075     // Mark as "dead" rather than outright deleting. This is so that other
   7076     // peephole style optimizations during or before lowering have access to
   7077     // this instruction in undeleted form. See for example
   7078     // tryOptimizedCmpxchgCmpBr().
   7079     Iter->second.Instr->setDead();
   7080     ++Iter;
   7081   }
   7082 }
   7083 
   7084 TargetARM32::Sandboxer::Sandboxer(TargetARM32 *Target,
   7085                                   InstBundleLock::Option BundleOption)
   7086     : Target(Target), BundleOption(BundleOption) {}
   7087 
   7088 TargetARM32::Sandboxer::~Sandboxer() {}
   7089 
   7090 namespace {
   7091 OperandARM32FlexImm *indirectBranchBicMask(Cfg *Func) {
   7092   constexpr uint32_t Imm8 = 0xFC; // 0xC000000F
   7093   constexpr uint32_t RotateAmt = 2;
   7094   return OperandARM32FlexImm::create(Func, IceType_i32, Imm8, RotateAmt);
   7095 }
   7096 
   7097 OperandARM32FlexImm *memOpBicMask(Cfg *Func) {
   7098   constexpr uint32_t Imm8 = 0x0C; // 0xC0000000
   7099   constexpr uint32_t RotateAmt = 2;
   7100   return OperandARM32FlexImm::create(Func, IceType_i32, Imm8, RotateAmt);
   7101 }
   7102 
   7103 static bool baseNeedsBic(Variable *Base) {
   7104   return Base->getRegNum() != RegARM32::Reg_r9 &&
   7105          Base->getRegNum() != RegARM32::Reg_sp;
   7106 }
   7107 } // end of anonymous namespace
   7108 
   7109 void TargetARM32::Sandboxer::createAutoBundle() {
   7110   Bundler = makeUnique<AutoBundle>(Target, BundleOption);
   7111 }
   7112 
   7113 void TargetARM32::Sandboxer::add_sp(Operand *AddAmount) {
   7114   Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
   7115   if (!Target->NeedSandboxing) {
   7116     Target->_add(SP, SP, AddAmount);
   7117     return;
   7118   }
   7119   createAutoBundle();
   7120   Target->_add(SP, SP, AddAmount);
   7121   Target->_bic(SP, SP, memOpBicMask(Target->Func));
   7122 }
   7123 
   7124 void TargetARM32::Sandboxer::align_sp(size_t Alignment) {
   7125   Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
   7126   if (!Target->NeedSandboxing) {
   7127     Target->alignRegisterPow2(SP, Alignment);
   7128     return;
   7129   }
   7130   createAutoBundle();
   7131   Target->alignRegisterPow2(SP, Alignment);
   7132   Target->_bic(SP, SP, memOpBicMask(Target->Func));
   7133 }
   7134 
   7135 InstARM32Call *TargetARM32::Sandboxer::bl(Variable *ReturnReg,
   7136                                           Operand *CallTarget) {
   7137   if (Target->NeedSandboxing) {
   7138     createAutoBundle();
   7139     if (auto *CallTargetR = llvm::dyn_cast<Variable>(CallTarget)) {
   7140       Target->_bic(CallTargetR, CallTargetR,
   7141                    indirectBranchBicMask(Target->Func));
   7142     }
   7143   }
   7144   return Target->Context.insert<InstARM32Call>(ReturnReg, CallTarget);
   7145 }
   7146 
   7147 void TargetARM32::Sandboxer::ldr(Variable *Dest, OperandARM32Mem *Mem,
   7148                                  CondARM32::Cond Pred) {
   7149   Variable *MemBase = Mem->getBase();
   7150   if (Target->NeedSandboxing && baseNeedsBic(MemBase)) {
   7151     createAutoBundle();
   7152     assert(!Mem->isRegReg());
   7153     Target->_bic(MemBase, MemBase, memOpBicMask(Target->Func), Pred);
   7154   }
   7155   Target->_ldr(Dest, Mem, Pred);
   7156 }
   7157 
   7158 void TargetARM32::Sandboxer::ldrex(Variable *Dest, OperandARM32Mem *Mem,
   7159                                    CondARM32::Cond Pred) {
   7160   Variable *MemBase = Mem->getBase();
   7161   if (Target->NeedSandboxing && baseNeedsBic(MemBase)) {
   7162     createAutoBundle();
   7163     assert(!Mem->isRegReg());
   7164     Target->_bic(MemBase, MemBase, memOpBicMask(Target->Func), Pred);
   7165   }
   7166   Target->_ldrex(Dest, Mem, Pred);
   7167 }
   7168 
   7169 void TargetARM32::Sandboxer::reset_sp(Variable *Src) {
   7170   Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
   7171   if (!Target->NeedSandboxing) {
   7172     Target->_mov_redefined(SP, Src);
   7173     return;
   7174   }
   7175   createAutoBundle();
   7176   Target->_mov_redefined(SP, Src);
   7177   Target->_bic(SP, SP, memOpBicMask(Target->Func));
   7178 }
   7179 
   7180 void TargetARM32::Sandboxer::ret(Variable *RetAddr, Variable *RetValue) {
   7181   if (Target->NeedSandboxing) {
   7182     createAutoBundle();
   7183     Target->_bic(RetAddr, RetAddr, indirectBranchBicMask(Target->Func));
   7184   }
   7185   Target->_ret(RetAddr, RetValue);
   7186 }
   7187 
   7188 void TargetARM32::Sandboxer::str(Variable *Src, OperandARM32Mem *Mem,
   7189                                  CondARM32::Cond Pred) {
   7190   Variable *MemBase = Mem->getBase();
   7191   if (Target->NeedSandboxing && baseNeedsBic(MemBase)) {
   7192     createAutoBundle();
   7193     assert(!Mem->isRegReg());
   7194     Target->_bic(MemBase, MemBase, memOpBicMask(Target->Func), Pred);
   7195   }
   7196   Target->_str(Src, Mem, Pred);
   7197 }
   7198 
   7199 void TargetARM32::Sandboxer::strex(Variable *Dest, Variable *Src,
   7200                                    OperandARM32Mem *Mem, CondARM32::Cond Pred) {
   7201   Variable *MemBase = Mem->getBase();
   7202   if (Target->NeedSandboxing && baseNeedsBic(MemBase)) {
   7203     createAutoBundle();
   7204     assert(!Mem->isRegReg());
   7205     Target->_bic(MemBase, MemBase, memOpBicMask(Target->Func), Pred);
   7206   }
   7207   Target->_strex(Dest, Src, Mem, Pred);
   7208 }
   7209 
   7210 void TargetARM32::Sandboxer::sub_sp(Operand *SubAmount) {
   7211   Variable *SP = Target->getPhysicalRegister(RegARM32::Reg_sp);
   7212   if (!Target->NeedSandboxing) {
   7213     Target->_sub(SP, SP, SubAmount);
   7214     return;
   7215   }
   7216   createAutoBundle();
   7217   Target->_sub(SP, SP, SubAmount);
   7218   Target->_bic(SP, SP, memOpBicMask(Target->Func));
   7219 }
   7220 
   7221 TargetDataARM32::TargetDataARM32(GlobalContext *Ctx)
   7222     : TargetDataLowering(Ctx) {}
   7223 
   7224 void TargetDataARM32::lowerGlobals(const VariableDeclarationList &Vars,
   7225                                    const std::string &SectionSuffix) {
   7226   const bool IsPIC = getFlags().getUseNonsfi();
   7227   switch (getFlags().getOutFileType()) {
   7228   case FT_Elf: {
   7229     ELFObjectWriter *Writer = Ctx->getObjectWriter();
   7230     Writer->writeDataSection(Vars, llvm::ELF::R_ARM_ABS32, SectionSuffix,
   7231                              IsPIC);
   7232   } break;
   7233   case FT_Asm:
   7234   case FT_Iasm: {
   7235     OstreamLocker _(Ctx);
   7236     for (const VariableDeclaration *Var : Vars) {
   7237       if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
   7238         emitGlobal(*Var, SectionSuffix);
   7239       }
   7240     }
   7241   } break;
   7242   }
   7243 }
   7244 
   7245 namespace {
   7246 template <typename T> struct ConstantPoolEmitterTraits;
   7247 
   7248 static_assert(sizeof(uint64_t) == 8,
   7249               "uint64_t is supposed to be 8 bytes wide.");
   7250 
   7251 // TODO(jpp): implement the following when implementing constant randomization:
   7252 //  * template <> struct ConstantPoolEmitterTraits<uint8_t>
   7253 //  * template <> struct ConstantPoolEmitterTraits<uint16_t>
   7254 //  * template <> struct ConstantPoolEmitterTraits<uint32_t>
   7255 template <> struct ConstantPoolEmitterTraits<float> {
   7256   using ConstantType = ConstantFloat;
   7257   static constexpr Type IceType = IceType_f32;
   7258   // AsmTag and TypeName can't be constexpr because llvm::StringRef is unhappy
   7259   // about them being constexpr.
   7260   static const char AsmTag[];
   7261   static const char TypeName[];
   7262   static uint64_t bitcastToUint64(float Value) {
   7263     static_assert(sizeof(Value) == sizeof(uint32_t),
   7264                   "Float should be 4 bytes.");
   7265     const uint32_t IntValue = Utils::bitCopy<uint32_t>(Value);
   7266     return static_cast<uint64_t>(IntValue);
   7267   }
   7268 };
   7269 const char ConstantPoolEmitterTraits<float>::AsmTag[] = ".long";
   7270 const char ConstantPoolEmitterTraits<float>::TypeName[] = "f32";
   7271 
   7272 template <> struct ConstantPoolEmitterTraits<double> {
   7273   using ConstantType = ConstantDouble;
   7274   static constexpr Type IceType = IceType_f64;
   7275   static const char AsmTag[];
   7276   static const char TypeName[];
   7277   static uint64_t bitcastToUint64(double Value) {
   7278     static_assert(sizeof(double) == sizeof(uint64_t),
   7279                   "Double should be 8 bytes.");
   7280     return Utils::bitCopy<uint64_t>(Value);
   7281   }
   7282 };
   7283 const char ConstantPoolEmitterTraits<double>::AsmTag[] = ".quad";
   7284 const char ConstantPoolEmitterTraits<double>::TypeName[] = "f64";
   7285 
   7286 template <typename T>
   7287 void emitConstant(
   7288     Ostream &Str,
   7289     const typename ConstantPoolEmitterTraits<T>::ConstantType *Const) {
   7290   using Traits = ConstantPoolEmitterTraits<T>;
   7291   Str << Const->getLabelName();
   7292   Str << ":\n\t" << Traits::AsmTag << "\t0x";
   7293   T Value = Const->getValue();
   7294   Str.write_hex(Traits::bitcastToUint64(Value));
   7295   Str << "\t/* " << Traits::TypeName << " " << Value << " */\n";
   7296 }
   7297 
   7298 template <typename T> void emitConstantPool(GlobalContext *Ctx) {
   7299   if (!BuildDefs::dump()) {
   7300     return;
   7301   }
   7302 
   7303   using Traits = ConstantPoolEmitterTraits<T>;
   7304   static constexpr size_t MinimumAlignment = 4;
   7305   SizeT Align = std::max(MinimumAlignment, typeAlignInBytes(Traits::IceType));
   7306   assert((Align % 4) == 0 && "Constants should be aligned");
   7307   Ostream &Str = Ctx->getStrEmit();
   7308   ConstantList Pool = Ctx->getConstantPool(Traits::IceType);
   7309 
   7310   Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",%progbits," << Align
   7311       << "\n"
   7312       << "\t.align\t" << Align << "\n";
   7313 
   7314   if (getFlags().getReorderPooledConstants()) {
   7315     // TODO(jpp): add constant pooling.
   7316     UnimplementedError(getFlags());
   7317   }
   7318 
   7319   for (Constant *C : Pool) {
   7320     if (!C->getShouldBePooled()) {
   7321       continue;
   7322     }
   7323 
   7324     emitConstant<T>(Str, llvm::dyn_cast<typename Traits::ConstantType>(C));
   7325   }
   7326 }
   7327 } // end of anonymous namespace
   7328 
   7329 void TargetDataARM32::lowerConstants() {
   7330   if (getFlags().getDisableTranslation())
   7331     return;
   7332   switch (getFlags().getOutFileType()) {
   7333   case FT_Elf: {
   7334     ELFObjectWriter *Writer = Ctx->getObjectWriter();
   7335     Writer->writeConstantPool<ConstantFloat>(IceType_f32);
   7336     Writer->writeConstantPool<ConstantDouble>(IceType_f64);
   7337   } break;
   7338   case FT_Asm:
   7339   case FT_Iasm: {
   7340     OstreamLocker _(Ctx);
   7341     emitConstantPool<float>(Ctx);
   7342     emitConstantPool<double>(Ctx);
   7343     break;
   7344   }
   7345   }
   7346 }
   7347 
   7348 void TargetDataARM32::lowerJumpTables() {
   7349   if (getFlags().getDisableTranslation())
   7350     return;
   7351   switch (getFlags().getOutFileType()) {
   7352   case FT_Elf:
   7353     if (!Ctx->getJumpTables().empty()) {
   7354       llvm::report_fatal_error("ARM32 does not support jump tables yet.");
   7355     }
   7356     break;
   7357   case FT_Asm:
   7358     // Already emitted from Cfg
   7359     break;
   7360   case FT_Iasm: {
   7361     // TODO(kschimpf): Fill this in when we get more information.
   7362     break;
   7363   }
   7364   }
   7365 }
   7366 
   7367 TargetHeaderARM32::TargetHeaderARM32(GlobalContext *Ctx)
   7368     : TargetHeaderLowering(Ctx), CPUFeatures(getFlags()) {}
   7369 
   7370 void TargetHeaderARM32::lower() {
   7371   OstreamLocker _(Ctx);
   7372   Ostream &Str = Ctx->getStrEmit();
   7373   Str << ".syntax unified\n";
   7374   // Emit build attributes in format: .eabi_attribute TAG, VALUE. See Sec. 2 of
   7375   // "Addenda to, and Errata in the ABI for the ARM architecture"
   7376   // http://infocenter.arm.com
   7377   //                  /help/topic/com.arm.doc.ihi0045d/IHI0045D_ABI_addenda.pdf
   7378   //
   7379   // Tag_conformance should be be emitted first in a file-scope sub-subsection
   7380   // of the first public subsection of the attributes.
   7381   Str << ".eabi_attribute 67, \"2.09\"      @ Tag_conformance\n";
   7382   // Chromebooks are at least A15, but do A9 for higher compat. For some
   7383   // reason, the LLVM ARM asm parser has the .cpu directive override the mattr
   7384   // specified on the commandline. So to test hwdiv, we need to set the .cpu
   7385   // directive higher (can't just rely on --mattr=...).
   7386   if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
   7387     Str << ".cpu    cortex-a15\n";
   7388   } else {
   7389     Str << ".cpu    cortex-a9\n";
   7390   }
   7391   Str << ".eabi_attribute 6, 10   @ Tag_CPU_arch: ARMv7\n"
   7392       << ".eabi_attribute 7, 65   @ Tag_CPU_arch_profile: App profile\n";
   7393   Str << ".eabi_attribute 8, 1    @ Tag_ARM_ISA_use: Yes\n"
   7394       << ".eabi_attribute 9, 2    @ Tag_THUMB_ISA_use: Thumb-2\n";
   7395   Str << ".fpu    neon\n"
   7396       << ".eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use: permit directly\n"
   7397       << ".eabi_attribute 20, 1   @ Tag_ABI_FP_denormal\n"
   7398       << ".eabi_attribute 21, 1   @ Tag_ABI_FP_exceptions\n"
   7399       << ".eabi_attribute 23, 3   @ Tag_ABI_FP_number_model: IEEE 754\n"
   7400       << ".eabi_attribute 34, 1   @ Tag_CPU_unaligned_access\n"
   7401       << ".eabi_attribute 24, 1   @ Tag_ABI_align_needed: 8-byte\n"
   7402       << ".eabi_attribute 25, 1   @ Tag_ABI_align_preserved: 8-byte\n"
   7403       << ".eabi_attribute 28, 1   @ Tag_ABI_VFP_args\n"
   7404       << ".eabi_attribute 36, 1   @ Tag_FP_HP_extension\n"
   7405       << ".eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format\n"
   7406       << ".eabi_attribute 42, 1   @ Tag_MPextension_use\n"
   7407       << ".eabi_attribute 68, 1   @ Tag_Virtualization_use\n";
   7408   if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
   7409     Str << ".eabi_attribute 44, 2   @ Tag_DIV_use\n";
   7410   }
   7411   // Technically R9 is used for TLS with Sandboxing, and we reserve it.
   7412   // However, for compatibility with current NaCl LLVM, don't claim that.
   7413   Str << ".eabi_attribute 14, 3   @ Tag_ABI_PCS_R9_use: Not used\n";
   7414 }
   7415 
   7416 SmallBitVector TargetARM32::TypeToRegisterSet[RegARM32::RCARM32_NUM];
   7417 SmallBitVector TargetARM32::TypeToRegisterSetUnfiltered[RegARM32::RCARM32_NUM];
   7418 SmallBitVector TargetARM32::RegisterAliases[RegARM32::Reg_NUM];
   7419 
   7420 } // end of namespace ARM32
   7421 } // end of namespace Ice
   7422