      1 //===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file contains a pass (at IR level) to replace atomic instructions with
     11 // __atomic_* library calls, or target-specific instructions which implement
     12 // the same semantics in a way which better fits the target backend.  This can
     13 // include the use of (intrinsic-based) load-linked/store-conditional loops,
     14 // AtomicCmpXchg, or type coercions.
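        //
        // As an illustrative sketch (the exact choice is target-dependent), an
        // atomic load whose size or alignment the target cannot handle natively,
        // e.g.
        //
        //     %v = load atomic i128, i128* %p seq_cst, align 16
        //
        // is rewritten into a call to the __atomic_load runtime routine, while a
        // natively supported operation is instead expanded to the target's
        // preferred form, such as an LL/SC or cmpxchg loop.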
     15 //
     16 //===----------------------------------------------------------------------===//
     17 
     18 #include "llvm/CodeGen/AtomicExpandUtils.h"
     19 #include "llvm/CodeGen/Passes.h"
     20 #include "llvm/IR/Function.h"
     21 #include "llvm/IR/IRBuilder.h"
     22 #include "llvm/IR/InstIterator.h"
     23 #include "llvm/IR/Instructions.h"
     24 #include "llvm/IR/Intrinsics.h"
     25 #include "llvm/IR/Module.h"
     26 #include "llvm/Support/Debug.h"
     27 #include "llvm/Support/raw_ostream.h"
     28 #include "llvm/Target/TargetLowering.h"
     29 #include "llvm/Target/TargetMachine.h"
     30 #include "llvm/Target/TargetSubtargetInfo.h"
     31 
     32 using namespace llvm;
     33 
     34 #define DEBUG_TYPE "atomic-expand"
     35 
     36 namespace {
     37   class AtomicExpand : public FunctionPass {
     38     const TargetMachine *TM;
     39     const TargetLowering *TLI;
     40   public:
     41     static char ID; // Pass identification, replacement for typeid
     42     explicit AtomicExpand(const TargetMachine *TM = nullptr)
     43       : FunctionPass(ID), TM(TM), TLI(nullptr) {
     44       initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
     45     }
     46 
     47     bool runOnFunction(Function &F) override;
     48 
     49   private:
     50     bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
     51                                bool IsStore, bool IsLoad);
     52     IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
     53     LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
     54     bool tryExpandAtomicLoad(LoadInst *LI);
     55     bool expandAtomicLoadToLL(LoadInst *LI);
     56     bool expandAtomicLoadToCmpXchg(LoadInst *LI);
     57     StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
     58     bool expandAtomicStore(StoreInst *SI);
     59     bool tryExpandAtomicRMW(AtomicRMWInst *AI);
     60     Value *
     61     insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
     62                       AtomicOrdering MemOpOrder,
     63                       function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
     64     void expandAtomicOpToLLSC(
     65         Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
     66         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
     67     void expandPartwordAtomicRMW(
     68         AtomicRMWInst *I,
     69         TargetLoweringBase::AtomicExpansionKind ExpansionKind);
     70     void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
     71 
     72     AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
     73     static Value *insertRMWCmpXchgLoop(
     74         IRBuilder<> &Builder, Type *ResultType, Value *Addr,
     75         AtomicOrdering MemOpOrder,
     76         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
     77         CreateCmpXchgInstFun CreateCmpXchg);
     78 
     79     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
     80     bool isIdempotentRMW(AtomicRMWInst *AI);
     81     bool simplifyIdempotentRMW(AtomicRMWInst *AI);
     82 
     83     bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
     84                                  Value *PointerOperand, Value *ValueOperand,
     85                                  Value *CASExpected, AtomicOrdering Ordering,
     86                                  AtomicOrdering Ordering2,
     87                                  ArrayRef<RTLIB::Libcall> Libcalls);
     88     void expandAtomicLoadToLibcall(LoadInst *LI);
     89     void expandAtomicStoreToLibcall(StoreInst *LI);
     90     void expandAtomicRMWToLibcall(AtomicRMWInst *I);
     91     void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
     92 
     93     friend bool
     94     llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
     95                                    CreateCmpXchgInstFun CreateCmpXchg);
     96   };
     97 }
     98 
     99 char AtomicExpand::ID = 0;
    100 char &llvm::AtomicExpandID = AtomicExpand::ID;
    101 INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
    102                    false, false)
    103 
    104 FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
    105   return new AtomicExpand(TM);
    106 }
    107 
    108 namespace {
    109 // Helper functions to retrieve the size of atomic instructions.
    110 unsigned getAtomicOpSize(LoadInst *LI) {
    111   const DataLayout &DL = LI->getModule()->getDataLayout();
    112   return DL.getTypeStoreSize(LI->getType());
    113 }
    114 
    115 unsigned getAtomicOpSize(StoreInst *SI) {
    116   const DataLayout &DL = SI->getModule()->getDataLayout();
    117   return DL.getTypeStoreSize(SI->getValueOperand()->getType());
    118 }
    119 
    120 unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
    121   const DataLayout &DL = RMWI->getModule()->getDataLayout();
    122   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
    123 }
    124 
    125 unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
    126   const DataLayout &DL = CASI->getModule()->getDataLayout();
    127   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
    128 }
    129 
    130 // Helper functions to retrieve the alignment of atomic instructions.
    131 unsigned getAtomicOpAlign(LoadInst *LI) {
    132   unsigned Align = LI->getAlignment();
    133   // In the future, if this IR restriction is relaxed, we should
    134   // return DataLayout::getABITypeAlignment when there's no align
    135   // value.
    136   assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
    137   return Align;
    138 }
    139 
    140 unsigned getAtomicOpAlign(StoreInst *SI) {
    141   unsigned Align = SI->getAlignment();
    142   // In the future, if this IR restriction is relaxed, we should
    143   // return DataLayout::getABITypeAlignment when there's no align
    144   // value.
    145   assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
    146   return Align;
    147 }
    148 
    149 unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
    150   // TODO(PR27168): This instruction has no alignment attribute, but unlike the
    151   // default alignment for load/store, the default here is to assume
    152   // it has NATURAL alignment, not DataLayout-specified alignment.
    153   const DataLayout &DL = RMWI->getModule()->getDataLayout();
    154   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
    155 }
    156 
    157 unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
    158   // TODO(PR27168): same comment as above.
    159   const DataLayout &DL = CASI->getModule()->getDataLayout();
    160   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
    161 }
    162 
    163 // Determine if a particular atomic operation has a supported size,
    164 // and is of appropriate alignment, to be passed through for target
    165 // lowering. (Versus turning into a __atomic libcall)
    166 template <typename Inst>
    167 bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
    168   unsigned Size = getAtomicOpSize(I);
    169   unsigned Align = getAtomicOpAlign(I);
    170   return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
    171 }
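
        // For example (illustrative; the limit comes from the target): with
        // getMaxAtomicSizeInBitsSupported() == 64, a naturally aligned i32 access
        // passes this test, a 16-byte access does not, and neither does an access
        // whose alignment is smaller than its size; rejected operations are turned
        // into __atomic_* libcalls by the caller.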
    172 
    173 } // end anonymous namespace
    174 
    175 bool AtomicExpand::runOnFunction(Function &F) {
    176   if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    177     return false;
    178   TLI = TM->getSubtargetImpl(F)->getTargetLowering();
    179 
    180   SmallVector<Instruction *, 1> AtomicInsts;
    181 
    182   // Changing control-flow while iterating through it is a bad idea, so gather a
    183   // list of all atomic instructions before we start.
    184   for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    185     Instruction *I = &*II;
    186     if (I->isAtomic() && !isa<FenceInst>(I))
    187       AtomicInsts.push_back(I);
    188   }
    189 
    190   bool MadeChange = false;
    191   for (auto I : AtomicInsts) {
    192     auto LI = dyn_cast<LoadInst>(I);
    193     auto SI = dyn_cast<StoreInst>(I);
    194     auto RMWI = dyn_cast<AtomicRMWInst>(I);
    195     auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    196     assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
    197 
    198     // If the Size/Alignment is not supported, replace with a libcall.
    199     if (LI) {
    200       if (!atomicSizeSupported(TLI, LI)) {
    201         expandAtomicLoadToLibcall(LI);
    202         MadeChange = true;
    203         continue;
    204       }
    205     } else if (SI) {
    206       if (!atomicSizeSupported(TLI, SI)) {
    207         expandAtomicStoreToLibcall(SI);
    208         MadeChange = true;
    209         continue;
    210       }
    211     } else if (RMWI) {
    212       if (!atomicSizeSupported(TLI, RMWI)) {
    213         expandAtomicRMWToLibcall(RMWI);
    214         MadeChange = true;
    215         continue;
    216       }
    217     } else if (CASI) {
    218       if (!atomicSizeSupported(TLI, CASI)) {
    219         expandAtomicCASToLibcall(CASI);
    220         MadeChange = true;
    221         continue;
    222       }
    223     }
    224 
    225     if (TLI->shouldInsertFencesForAtomic(I)) {
    226       auto FenceOrdering = AtomicOrdering::Monotonic;
    227       bool IsStore, IsLoad;
    228       if (LI && isAcquireOrStronger(LI->getOrdering())) {
    229         FenceOrdering = LI->getOrdering();
    230         LI->setOrdering(AtomicOrdering::Monotonic);
    231         IsStore = false;
    232         IsLoad = true;
    233       } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
    234         FenceOrdering = SI->getOrdering();
    235         SI->setOrdering(AtomicOrdering::Monotonic);
    236         IsStore = true;
    237         IsLoad = false;
    238       } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
    239                           isAcquireOrStronger(RMWI->getOrdering()))) {
    240         FenceOrdering = RMWI->getOrdering();
    241         RMWI->setOrdering(AtomicOrdering::Monotonic);
    242         IsStore = IsLoad = true;
    243       } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
    244                  (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
    245                   isAcquireOrStronger(CASI->getSuccessOrdering()))) {
    246         // If a compare and swap is lowered to LL/SC, we can do smarter fence
    247         // insertion, with a stronger one on the success path than on the
    248         // failure path. As a result, fence insertion is directly done by
    249         // expandAtomicCmpXchg in that case.
    250         FenceOrdering = CASI->getSuccessOrdering();
    251         CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
    252         CASI->setFailureOrdering(AtomicOrdering::Monotonic);
    253         IsStore = IsLoad = true;
    254       }
    255 
    256       if (FenceOrdering != AtomicOrdering::Monotonic) {
    257         MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
    258       }
    259     }
    260 
    261     if (LI) {
    262       if (LI->getType()->isFloatingPointTy()) {
    263         // TODO: add a TLI hook to control this so that each target can switch
    264         // to lowering the original type natively, one target at a time.
    265         LI = convertAtomicLoadToIntegerType(LI);
    266         assert(LI->getType()->isIntegerTy() && "invariant broken");
    267         MadeChange = true;
    268       }
    269 
    270       MadeChange |= tryExpandAtomicLoad(LI);
    271     } else if (SI) {
    272       if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
    273         // TODO: add a TLI hook to control this so that each target can switch
    274         // to lowering the original type natively, one target at a time.
    275         SI = convertAtomicStoreToIntegerType(SI);
    276         assert(SI->getValueOperand()->getType()->isIntegerTy() &&
    277                "invariant broken");
    278         MadeChange = true;
    279       }
    280 
    281       if (TLI->shouldExpandAtomicStoreInIR(SI))
    282         MadeChange |= expandAtomicStore(SI);
    283     } else if (RMWI) {
    284       // There are two different ways of expanding RMW instructions:
    285       // - into a load if it is idempotent
    286       // - into a Cmpxchg/LL-SC loop otherwise
    287       // we try them in that order.
    288 
    289       if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
    290         MadeChange = true;
    291       } else {
    292         MadeChange |= tryExpandAtomicRMW(RMWI);
    293       }
    294     } else if (CASI) {
    295       // TODO: when we're ready to make the change at the IR level, we can
    296       // extend convertCmpXchgToIntegerType for floating point too.
    297       assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
    298              "unimplemented - floating point not legal at IR level");
    299       if (CASI->getCompareOperand()->getType()->isPointerTy()) {
    300         // TODO: add a TLI hook to control this so that each target can switch
    301         // to lowering the original type natively, one target at a time.
    302         CASI = convertCmpXchgToIntegerType(CASI);
    303         assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
    304                "invariant broken");
    305         MadeChange = true;
    306       }
    307 
    308       unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    309       unsigned ValueSize = getAtomicOpSize(CASI);
    310       if (ValueSize < MinCASSize) {
    311         assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
    312                "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
    313         expandPartwordCmpXchg(CASI);
    314       } else {
    315         if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
    316           MadeChange |= expandAtomicCmpXchg(CASI);
    317       }
    318     }
    319   }
    320   return MadeChange;
    321 }
    322 
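        // Illustrative sketch (the fences actually emitted are target-specific): for
        // a target that requests fence insertion, a seq_cst load such as
        //
        //     %v = load atomic i32, i32* %p seq_cst, align 4
        //
        // has already been downgraded to monotonic by runOnFunction, and this routine
        // then brackets it with the target's leading and trailing fences.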
    323 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
    324                                          bool IsStore, bool IsLoad) {
    325   IRBuilder<> Builder(I);
    326 
    327   auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
    328 
    329   auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
    330   // The trailing fence is emitted before the instruction instead of after
    331   // because there is no easy way of setting the Builder's insertion point
    332   // after an instruction, so we must erase the fence from the BB and insert
    333   // it back in the right place.
    334   // We have a guard here because not every atomic operation generates a
    335   // trailing fence.
    336   if (TrailingFence) {
    337     TrailingFence->removeFromParent();
    338     TrailingFence->insertAfter(I);
    339   }
    340 
    341   return (LeadingFence || TrailingFence);
    342 }
    343 
    344 /// Get the iX type with the same bitwidth as T.
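        /// For example (illustrative), float maps to i32 and double to i64; a pointer
        /// maps to the integer type whose width matches the DataLayout's pointer size
        /// for its address space.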
    345 IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
    346                                                        const DataLayout &DL) {
    347   EVT VT = TLI->getValueType(DL, T);
    348   unsigned BitWidth = VT.getStoreSizeInBits();
    349   assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
    350   return IntegerType::get(T->getContext(), BitWidth);
    351 }
    352 
    353 /// Convert an atomic load of a non-integral type to an integer load of the
    354 /// equivalent bitwidth.  See the function comment on
    355 /// convertAtomicStoreToIntegerType for background.
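        ///
        /// Illustrative sketch of the rewrite:
        ///
        ///     %v = load atomic float, float* %p seq_cst, align 4
        ///   becomes
        ///     %p.i = bitcast float* %p to i32*
        ///     %v.i = load atomic i32, i32* %p.i seq_cst, align 4
        ///     %v   = bitcast i32 %v.i to float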
    356 LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
    357   auto *M = LI->getModule();
    358   Type *NewTy = getCorrespondingIntegerType(LI->getType(),
    359                                             M->getDataLayout());
    360 
    361   IRBuilder<> Builder(LI);
    362 
    363   Value *Addr = LI->getPointerOperand();
    364   Type *PT = PointerType::get(NewTy,
    365                               Addr->getType()->getPointerAddressSpace());
    366   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
    367 
    368   auto *NewLI = Builder.CreateLoad(NewAddr);
    369   NewLI->setAlignment(LI->getAlignment());
    370   NewLI->setVolatile(LI->isVolatile());
    371   NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
    372   DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
    373 
    374   Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
    375   LI->replaceAllUsesWith(NewVal);
    376   LI->eraseFromParent();
    377   return NewLI;
    378 }
    379 
    380 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
    381   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
    382   case TargetLoweringBase::AtomicExpansionKind::None:
    383     return false;
    384   case TargetLoweringBase::AtomicExpansionKind::LLSC:
    385     expandAtomicOpToLLSC(
    386         LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
    387         [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
    388     return true;
    389   case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    390     return expandAtomicLoadToLL(LI);
    391   case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    392     return expandAtomicLoadToCmpXchg(LI);
    393   }
    394   llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
    395 }
    396 
    397 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
    398   IRBuilder<> Builder(LI);
    399 
    400   // On some architectures, load-linked instructions are atomic for larger
    401   // sizes than normal loads. For example, the only 64-bit load guaranteed
    402   // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
    403   Value *Val =
    404       TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
    405   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
    406 
    407   LI->replaceAllUsesWith(Val);
    408   LI->eraseFromParent();
    409 
    410   return true;
    411 }
    412 
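        // Illustrative sketch of the expansion performed below:
        //
        //     %v = load atomic i32, i32* %p acquire, align 4
        //   becomes
        //     %pair = cmpxchg i32* %p, i32 0, i32 0 acquire acquire
        //     %v    = extractvalue { i32, i1 } %pair, 0
        //
        // Either the location already holds 0 (and the no-op store of 0 succeeds) or
        // the compare fails; in both cases the first element of the pair is the
        // current value.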
    413 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
    414   IRBuilder<> Builder(LI);
    415   AtomicOrdering Order = LI->getOrdering();
    416   Value *Addr = LI->getPointerOperand();
    417   Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
    418   Constant *DummyVal = Constant::getNullValue(Ty);
    419 
    420   Value *Pair = Builder.CreateAtomicCmpXchg(
    421       Addr, DummyVal, DummyVal, Order,
    422       AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
    423   Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
    424 
    425   LI->replaceAllUsesWith(Loaded);
    426   LI->eraseFromParent();
    427 
    428   return true;
    429 }
    430 
    431 /// Convert an atomic store of a non-integral type to an integer store of the
    432 /// equivalent bitwidth.  We used to not support floating point or vector
    433 /// atomics in the IR at all.  The backends learned to deal with the bitcast
    434 /// idiom because that was the only way of expressing the notion of an atomic
    435 /// float or vector store.  The long term plan is to teach each backend to
    436 /// instruction select from the original atomic store, but as a migration
    437 /// mechanism, we convert back to the old format which the backends understand.
    438 /// Each backend will need individual work to recognize the new format.
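        ///
        /// Illustrative sketch of the rewrite:
        ///
        ///     store atomic float %v, float* %p release, align 4
        ///   becomes
        ///     %v.i = bitcast float %v to i32
        ///     %p.i = bitcast float* %p to i32*
        ///     store atomic i32 %v.i, i32* %p.i release, align 4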
    439 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
    440   IRBuilder<> Builder(SI);
    441   auto *M = SI->getModule();
    442   Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
    443                                             M->getDataLayout());
    444   Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
    445 
    446   Value *Addr = SI->getPointerOperand();
    447   Type *PT = PointerType::get(NewTy,
    448                               Addr->getType()->getPointerAddressSpace());
    449   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
    450 
    451   StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
    452   NewSI->setAlignment(SI->getAlignment());
    453   NewSI->setVolatile(SI->isVolatile());
    454   NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
    455   DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
    456   SI->eraseFromParent();
    457   return NewSI;
    458 }
    459 
    460 bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
    461   // This function is only called on atomic stores that are too large to be
    462   // atomic if implemented as a native store. So we replace them with an
    463   // atomic swap, which can be implemented for example as a ldrex/strex on ARM
    464   // or lock cmpxchg8b/16b on X86, as these are atomic for larger sizes.
    465   // It is the responsibility of the target to only signal expansion via
    466   // shouldExpandAtomicStoreInIR in cases where this is required and possible.
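          // Illustrative sketch: store atomic i64 %v, i64* %p release becomes
          // atomicrmw xchg i64* %p, i64 %v release (whose result is unused), and
          // that swap is then expanded like any other atomicrmw.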
    467   IRBuilder<> Builder(SI);
    468   AtomicRMWInst *AI =
    469       Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
    470                               SI->getValueOperand(), SI->getOrdering());
    471   SI->eraseFromParent();
    472 
    473   // Now we have an appropriate swap instruction, lower it as usual.
    474   return tryExpandAtomicRMW(AI);
    475 }
    476 
    477 static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
    478                                  Value *Loaded, Value *NewVal,
    479                                  AtomicOrdering MemOpOrder,
    480                                  Value *&Success, Value *&NewLoaded) {
    481   Value* Pair = Builder.CreateAtomicCmpXchg(
    482       Addr, Loaded, NewVal, MemOpOrder,
    483       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
    484   Success = Builder.CreateExtractValue(Pair, 1, "success");
    485   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
    486 }
    487 
    488 /// Emit IR to implement the given atomicrmw operation on values in registers,
    489 /// returning the new value.
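        ///
        /// For example (illustrative), Nand is emitted as not(and(Loaded, Inc)), and
        /// Max as a signed icmp sgt feeding a select between Loaded and Inc; Xchg
        /// simply returns Inc.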
    490 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
    491                               Value *Loaded, Value *Inc) {
    492   Value *NewVal;
    493   switch (Op) {
    494   case AtomicRMWInst::Xchg:
    495     return Inc;
    496   case AtomicRMWInst::Add:
    497     return Builder.CreateAdd(Loaded, Inc, "new");
    498   case AtomicRMWInst::Sub:
    499     return Builder.CreateSub(Loaded, Inc, "new");
    500   case AtomicRMWInst::And:
    501     return Builder.CreateAnd(Loaded, Inc, "new");
    502   case AtomicRMWInst::Nand:
    503     return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
    504   case AtomicRMWInst::Or:
    505     return Builder.CreateOr(Loaded, Inc, "new");
    506   case AtomicRMWInst::Xor:
    507     return Builder.CreateXor(Loaded, Inc, "new");
    508   case AtomicRMWInst::Max:
    509     NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    510     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    511   case AtomicRMWInst::Min:
    512     NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    513     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    514   case AtomicRMWInst::UMax:
    515     NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    516     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    517   case AtomicRMWInst::UMin:
    518     NewVal = Builder.CreateICmpULE(Loaded, Inc);
    519     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    520   default:
    521     llvm_unreachable("Unknown atomic op");
    522   }
    523 }
    524 
    525 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
    526   switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
    527   case TargetLoweringBase::AtomicExpansionKind::None:
    528     return false;
    529   case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    530     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    531     unsigned ValueSize = getAtomicOpSize(AI);
    532     if (ValueSize < MinCASSize) {
    533       llvm_unreachable(
    534           "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
    535     } else {
    536       auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    537         return performAtomicOp(AI->getOperation(), Builder, Loaded,
    538                                AI->getValOperand());
    539       };
    540       expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
    541                            AI->getOrdering(), PerformOp);
    542     }
    543     return true;
    544   }
    545   case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    546     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    547     unsigned ValueSize = getAtomicOpSize(AI);
    548     if (ValueSize < MinCASSize) {
    549       expandPartwordAtomicRMW(AI,
    550                               TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    551     } else {
    552       expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
    553     }
    554     return true;
    555   }
    556   default:
    557     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
    558   }
    559 }
    560 
    561 namespace {
    562 
    563 /// Result values from createMaskInstrs helper.
    564 struct PartwordMaskValues {
    565   Type *WordType;
    566   Type *ValueType;
    567   Value *AlignedAddr;
    568   Value *ShiftAmt;
    569   Value *Mask;
    570   Value *Inv_Mask;
    571 };
    572 } // end anonymous namespace
    573 
    574 /// This is a helper function which builds instructions to provide
    575 /// values necessary for partword atomic operations. It takes an
    576 /// incoming address, Addr, and ValueType, and constructs the address,
    577 /// shift-amounts and masks needed to work with a larger value of size
    578 /// WordSize.
    579 ///
    580 /// AlignedAddr: Addr rounded down to a multiple of WordSize
    581 ///
    582 /// ShiftAmt: Number of bits to right-shift a WordSize value loaded
    583 ///           from AlignedAddr for it to have the same value as if
    584 ///           ValueType was loaded from Addr.
    585 ///
    586 /// Mask: Value to mask with the value loaded from AlignedAddr to
    587 ///       include only the part that would've been loaded from Addr.
    588 ///
    589 /// Inv_Mask: The inverse of Mask.
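        ///
        /// Illustrative example (little-endian, WordSize == 4): for an i8 value at
        /// address 0x1003, AlignedAddr is 0x1000, ShiftAmt is 24, Mask is 0xFF000000
        /// and Inv_Mask is 0x00FFFFFF.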
    590 
    591 static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
    592                                            Type *ValueType, Value *Addr,
    593                                            unsigned WordSize) {
    594   PartwordMaskValues Ret;
    595 
    596   BasicBlock *BB = I->getParent();
    597   Function *F = BB->getParent();
    598   Module *M = I->getModule();
    599 
    600   LLVMContext &Ctx = F->getContext();
    601   const DataLayout &DL = M->getDataLayout();
    602 
    603   unsigned ValueSize = DL.getTypeStoreSize(ValueType);
    604 
    605   assert(ValueSize < WordSize);
    606 
    607   Ret.ValueType = ValueType;
    608   Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
    609 
    610   Type *WordPtrType =
    611       Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
    612 
    613   Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
    614   Ret.AlignedAddr = Builder.CreateIntToPtr(
    615       Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
    616       "AlignedAddr");
    617 
    618   Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
    619   if (DL.isLittleEndian()) {
    620     // turn bytes into bits
    621     Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
    622   } else {
    623     // turn bytes into bits, and count from the other side.
    624     Ret.ShiftAmt =
    625         Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
    626   }
    627 
    628   Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
    629   Ret.Mask = Builder.CreateShl(
    630       ConstantInt::get(Ret.WordType, (uint64_t(1) << (ValueSize * 8)) - 1),
    631       Ret.ShiftAmt, "Mask");
    632   Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
    633 
    634   return Ret;
    635 }
    636 
    637 /// Emit IR to implement a masked version of a given atomicrmw
    638 /// operation. (That is, only the bits under the Mask should be
    639 /// affected by the operation)
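        ///
        /// For example (illustrative), a masked Add computes the full-width sum of
        /// Loaded and Shifted_Inc, keeps only the bits under PMV.Mask from that sum,
        /// and ORs them back into the untouched bits of Loaded.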
    640 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
    641                                     IRBuilder<> &Builder, Value *Loaded,
    642                                     Value *Shifted_Inc, Value *Inc,
    643                                     const PartwordMaskValues &PMV) {
    644   switch (Op) {
    645   case AtomicRMWInst::Xchg: {
    646     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    647     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
    648     return FinalVal;
    649   }
    650   case AtomicRMWInst::Or:
    651   case AtomicRMWInst::Xor:
    652     // Or/Xor won't affect any other bits, so can just be done
    653     // directly.
    654     return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    655   case AtomicRMWInst::Add:
    656   case AtomicRMWInst::Sub:
    657   case AtomicRMWInst::And:
    658   case AtomicRMWInst::Nand: {
    659     // The other arithmetic ops need to be masked into place.
    660     Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    661     Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
    662     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    663     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
    664     return FinalVal;
    665   }
    666   case AtomicRMWInst::Max:
    667   case AtomicRMWInst::Min:
    668   case AtomicRMWInst::UMax:
    669   case AtomicRMWInst::UMin: {
    670     // Finally, comparison ops will operate on the full value, so
    671     // truncate down to the original size, and expand out again after
    672     // doing the operation.
    673     Value *Loaded_Shiftdown = Builder.CreateTrunc(
    674         Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
    675     Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
    676     Value *NewVal_Shiftup = Builder.CreateShl(
    677         Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
    678     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    679     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
    680     return FinalVal;
    681   }
    682   default:
    683     llvm_unreachable("Unknown atomic op");
    684   }
    685 }
    686 
    687 /// Expand a sub-word atomicrmw operation into an appropriate
    688 /// word-sized operation.
    689 ///
    690 /// It will create an LL/SC or cmpxchg loop, as appropriate, the same
    691 /// way as a typical atomicrmw expansion. The only difference here is
    692 /// that the operation inside of the loop must operate only upon a
    693 /// part of the value.
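        ///
        /// Illustrative sketch: an atomicrmw add on an i16 is widened to a cmpxchg
        /// loop on the containing aligned word (e.g. an i32), with the addend shifted
        /// into the i16's position and all updates confined to the bits under
        /// PMV.Mask.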
    694 void AtomicExpand::expandPartwordAtomicRMW(
    695     AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
    696 
    697   assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    698 
    699   AtomicOrdering MemOpOrder = AI->getOrdering();
    700 
    701   IRBuilder<> Builder(AI);
    702 
    703   PartwordMaskValues PMV =
    704       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
    705                        TLI->getMinCmpXchgSizeInBits() / 8);
    706 
    707   Value *ValOperand_Shifted =
    708       Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
    709                         PMV.ShiftAmt, "ValOperand_Shifted");
    710 
    711   auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    712     return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
    713                                  ValOperand_Shifted, AI->getValOperand(), PMV);
    714   };
    715 
    716   // TODO: When we're ready to support LLSC conversions too, use
    717   // insertRMWLLSCLoop here for ExpansionKind==LLSC.
    718   Value *OldResult =
    719       insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
    720                            PerformPartwordOp, createCmpXchgInstFun);
    721   Value *FinalOldResult = Builder.CreateTrunc(
    722       Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
    723   AI->replaceAllUsesWith(FinalOldResult);
    724   AI->eraseFromParent();
    725 }
    726 
    727 void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
    728   // The basic idea here is that we're expanding a cmpxchg of a
    729   // smaller memory size up to a word-sized cmpxchg. To do this, we
    730   // need to add a retry-loop for strong cmpxchg, so that
    731   // modifications to other parts of the word don't cause a spurious
    732   // failure.
    733 
    734   // This generates code like the following:
    735   //     [[Setup mask values PMV.*]]
    736   //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
    737   //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
    738   //     %InitLoaded = load i32* %addr
    739   //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
    740   //     br partword.cmpxchg.loop
    741   // partword.cmpxchg.loop:
    742   //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
    743   //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
    744   //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
    745   //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
    746   //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
    747   //        i32 %FullWord_NewVal success_ordering failure_ordering
    748   //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
    749   //     %Success = extractvalue { i32, i1 } %NewCI, 1
    750   //     br i1 %Success, label %partword.cmpxchg.end,
    751   //        label %partword.cmpxchg.failure
    752   // partword.cmpxchg.failure:
    753   //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
    754   //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
    755   //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
    756   //         label %partword.cmpxchg.end
    757   // partword.cmpxchg.end:
    758   //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
    759   //    %FinalOldVal = trunc i32 %tmp1 to i8
    760   //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
    761   //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
    762 
    763   Value *Addr = CI->getPointerOperand();
    764   Value *Cmp = CI->getCompareOperand();
    765   Value *NewVal = CI->getNewValOperand();
    766 
    767   BasicBlock *BB = CI->getParent();
    768   Function *F = BB->getParent();
    769   IRBuilder<> Builder(CI);
    770   LLVMContext &Ctx = Builder.getContext();
    771 
    772   const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;
    773 
    774   BasicBlock *EndBB =
    775       BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
    776   auto FailureBB =
    777       BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
    778   auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
    779 
    780   // The split call above "helpfully" added a branch at the end of BB
    781   // (to the wrong place).
    782   std::prev(BB->end())->eraseFromParent();
    783   Builder.SetInsertPoint(BB);
    784 
    785   PartwordMaskValues PMV = createMaskInstrs(
    786       Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);
    787 
    788   // Shift the incoming values over, into the right location in the word.
    789   Value *NewVal_Shifted =
    790       Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
    791   Value *Cmp_Shifted =
    792       Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
    793 
    794   // Load the entire current word, and mask into place the expected and new
    795   // values
    796   LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
    797   InitLoaded->setVolatile(CI->isVolatile());
    798   Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
    799   Builder.CreateBr(LoopBB);
    800 
    801   // partword.cmpxchg.loop:
    802   Builder.SetInsertPoint(LoopBB);
    803   PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
    804   Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
    805 
    806   // Mask/Or the expected and new values into place in the loaded word.
    807   Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
    808   Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
    809   AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
    810       PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
    811       CI->getFailureOrdering(), CI->getSynchScope());
    812   NewCI->setVolatile(CI->isVolatile());
    813   // When we're building a strong cmpxchg, we need a retry loop, so one might
    814   // think the cmpxchg inside it could be weak. But using a strong cmpxchg
    815   // keeps the ShouldContinue comparison below meaningful (a failure really
    816   // means the loaded value differed), and the underlying cmpxchg is expected
    817   // to be a machine instruction, which is strong anyway.
    818   NewCI->setWeak(CI->isWeak());
    819 
    820   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
    821   Value *Success = Builder.CreateExtractValue(NewCI, 1);
    822 
    823   if (CI->isWeak())
    824     Builder.CreateBr(EndBB);
    825   else
    826     Builder.CreateCondBr(Success, EndBB, FailureBB);
    827 
    828   // partword.cmpxchg.failure:
    829   Builder.SetInsertPoint(FailureBB);
    830   // Upon failure, check whether the masked-out part of the loaded value has
    831   // been modified.  If it hasn't, then the masked-in part caused the failure,
    832   // so the cmpxchg has genuinely failed and we can exit the loop.
    833   Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
    834   Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
    835   Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
    836 
    837   // Add the second value to the phi from above
    838   Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
    839 
    840   // partword.cmpxchg.end:
    841   Builder.SetInsertPoint(CI);
    842 
    843   Value *FinalOldVal = Builder.CreateTrunc(
    844       Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
    845   Value *Res = UndefValue::get(CI->getType());
    846   Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
    847   Res = Builder.CreateInsertValue(Res, Success, 1);
    848 
    849   CI->replaceAllUsesWith(Res);
    850   CI->eraseFromParent();
    851 }
    852 
    853 void AtomicExpand::expandAtomicOpToLLSC(
    854     Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
    855     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
    856   IRBuilder<> Builder(I);
    857   Value *Loaded =
    858       insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);
    859 
    860   I->replaceAllUsesWith(Loaded);
    861   I->eraseFromParent();
    862 }
    863 
    864 Value *AtomicExpand::insertRMWLLSCLoop(
    865     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    866     AtomicOrdering MemOpOrder,
    867     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
    868   LLVMContext &Ctx = Builder.getContext();
    869   BasicBlock *BB = Builder.GetInsertBlock();
    870   Function *F = BB->getParent();
    871 
    872   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
    873   //
    874   // The standard expansion we produce is:
    875   //     [...]
    876   // atomicrmw.start:
    877   //     %loaded = @load.linked(%addr)
    878   //     %new = some_op iN %loaded, %incr
    879   //     %stored = @store_conditional(%new, %addr)
    880   //     %try_again = icmp i32 ne %stored, 0
    881   //     br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
    882   // atomicrmw.end:
    883   //     [...]
    884   BasicBlock *ExitBB =
    885       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
    886   BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
    887 
    888   // The split call above "helpfully" added a branch at the end of BB (to the
    889   // wrong place).
    890   std::prev(BB->end())->eraseFromParent();
    891   Builder.SetInsertPoint(BB);
    892   Builder.CreateBr(LoopBB);
    893 
    894   // Start the main loop block now that we've taken care of the preliminaries.
    895   Builder.SetInsertPoint(LoopBB);
    896   Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    897 
    898   Value *NewVal = PerformOp(Builder, Loaded);
    899 
    900   Value *StoreSuccess =
    901       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
    902   Value *TryAgain = Builder.CreateICmpNE(
    903       StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
    904   Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
    905 
    906   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
    907   return Loaded;
    908 }
    909 
    910 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
    911 /// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
    912 /// IR.  As a migration step, we convert back to what use to be the standard
    913 /// way to represent a pointer cmpxchg so that we can update backends one by
    914 /// one.
    915 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
    916   auto *M = CI->getModule();
    917   Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
    918                                             M->getDataLayout());
    919 
    920   IRBuilder<> Builder(CI);
    921 
    922   Value *Addr = CI->getPointerOperand();
    923   Type *PT = PointerType::get(NewTy,
    924                               Addr->getType()->getPointerAddressSpace());
    925   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
    926 
    927   Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
    928   Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
    929 
    930 
    931   auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
    932                                             CI->getSuccessOrdering(),
    933                                             CI->getFailureOrdering(),
    934                                             CI->getSynchScope());
    935   NewCI->setVolatile(CI->isVolatile());
    936   NewCI->setWeak(CI->isWeak());
    937   DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
    938 
    939   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
    940   Value *Succ = Builder.CreateExtractValue(NewCI, 1);
    941 
    942   OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
    943 
    944   Value *Res = UndefValue::get(CI->getType());
    945   Res = Builder.CreateInsertValue(Res, OldVal, 0);
    946   Res = Builder.CreateInsertValue(Res, Succ, 1);
    947 
    948   CI->replaceAllUsesWith(Res);
    949   CI->eraseFromParent();
    950   return NewCI;
    951 }
    952 
    953 
    954 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
    955   AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
    956   AtomicOrdering FailureOrder = CI->getFailureOrdering();
    957   Value *Addr = CI->getPointerOperand();
    958   BasicBlock *BB = CI->getParent();
    959   Function *F = BB->getParent();
    960   LLVMContext &Ctx = F->getContext();
    961   // If shouldInsertFencesForAtomic() returns true, then the target does not
    962   // want to deal with memory orders, and emitLeading/TrailingFence should take
    963   // care of everything. Otherwise, emitLeading/TrailingFence are no-ops and we
    964   // should preserve the ordering.
    965   bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
    966   AtomicOrdering MemOpOrder =
    967       ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
    968 
    969   // In implementations which use a barrier to achieve release semantics, we can
    970   // delay emitting this barrier until we know a store is actually going to be
    971   // attempted. The cost of this delay is that we need 2 copies of the block
    972   // emitting the load-linked, affecting code size.
    973   //
    974   // Ideally, this logic would be unconditional except for the minsize check
    975   // since in other cases the extra blocks naturally collapse down to the
    976   // minimal loop. Unfortunately, this puts too much stress on later
    977   // optimisations so we avoid emitting the extra logic in those cases too.
    978   bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
    979                            SuccessOrder != AtomicOrdering::Monotonic &&
    980                            SuccessOrder != AtomicOrdering::Acquire &&
    981                            !F->optForMinSize();
    982 
    983   // There's no overhead for sinking the release barrier in a weak cmpxchg, so
    984   // do it even on minsize.
    985   bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();
    986 
    987   // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
    988   //
    989   // The full expansion we produce is:
    990   //     [...]
    991   // cmpxchg.start:
    992   //     %unreleasedload = @load.linked(%addr)
    993   //     %should_store = icmp eq %unreleasedload, %desired
    994   //     br i1 %should_store, label %cmpxchg.fencedstore,
    995   //                          label %cmpxchg.nostore
    996   // cmpxchg.fencedstore:
    997   //     fence?
    998   //     br label cmpxchg.trystore
    999   // cmpxchg.trystore:
   1000   //     %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
   1001   //                            [%releasedload, %cmpxchg.releasedload]
   1002   //     %stored = @store_conditional(%new, %addr)
   1003   //     %success = icmp eq i32 %stored, 0
   1004   //     br i1 %success, label %cmpxchg.success,
   1005   //                     label %cmpxchg.releasedload/%cmpxchg.failure
   1006   // cmpxchg.releasedload:
   1007   //     %releasedload = @load.linked(%addr)
   1008   //     %should_store = icmp eq %releasedload, %desired
   1009   //     br i1 %should_store, label %cmpxchg.trystore,
   1010   //                          label %cmpxchg.failure
   1011   // cmpxchg.success:
   1012   //     fence?
   1013   //     br label %cmpxchg.end
   1014   // cmpxchg.nostore:
   1015   //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
   1016   //                           [%releasedload,
   1017   //                               %cmpxchg.releasedload/%cmpxchg.trystore]
   1018   //     @load_linked_fail_balance()?
   1019   //     br label %cmpxchg.failure
   1020   // cmpxchg.failure:
   1021   //     fence?
   1022   //     br label %cmpxchg.end
   1023   // cmpxchg.end:
   1024   //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
   1025   //                   [%loaded.trystore, %cmpxchg.trystore]
   1026   //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
   1027   //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
   1028   //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
   1029   //     [...]
   1030   BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
   1031   auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
   1032   auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
   1033   auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
   1034   auto ReleasedLoadBB =
   1035       BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
   1036   auto TryStoreBB =
   1037       BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
   1038   auto ReleasingStoreBB =
   1039       BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
   1040   auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
   1041 
   1042   // This grabs the DebugLoc from CI
   1043   IRBuilder<> Builder(CI);
   1044 
   1045   // The split call above "helpfully" added a branch at the end of BB (to the
   1046   // wrong place), but we might want a fence too. It's easiest to just remove
   1047   // the branch entirely.
   1048   std::prev(BB->end())->eraseFromParent();
   1049   Builder.SetInsertPoint(BB);
   1050   if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
   1051     TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
   1052                           /*IsLoad=*/true);
   1053   Builder.CreateBr(StartBB);
   1054 
   1055   // Start the main loop block now that we've taken care of the preliminaries.
   1056   Builder.SetInsertPoint(StartBB);
   1057   Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
   1058   Value *ShouldStore = Builder.CreateICmpEQ(
   1059       UnreleasedLoad, CI->getCompareOperand(), "should_store");
   1060 
   1061   // If the cmpxchg doesn't actually need any ordering when it fails, we can
   1062   // jump straight past that fence instruction (if it exists).
   1063   Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
   1064 
   1065   Builder.SetInsertPoint(ReleasingStoreBB);
   1066   if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
   1067     TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
   1068                           /*IsLoad=*/true);
   1069   Builder.CreateBr(TryStoreBB);
   1070 
   1071   Builder.SetInsertPoint(TryStoreBB);
   1072   Value *StoreSuccess = TLI->emitStoreConditional(
   1073       Builder, CI->getNewValOperand(), Addr, MemOpOrder);
   1074   StoreSuccess = Builder.CreateICmpEQ(
   1075       StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
   1076   BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
   1077   Builder.CreateCondBr(StoreSuccess, SuccessBB,
   1078                        CI->isWeak() ? FailureBB : RetryBB);
   1079 
   1080   Builder.SetInsertPoint(ReleasedLoadBB);
   1081   Value *SecondLoad;
   1082   if (HasReleasedLoadBB) {
   1083     SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
   1084     ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
   1085                                        "should_store");
   1086 
   1087     // If the cmpxchg doesn't actually need any ordering when it fails, we can
   1088     // jump straight past that fence instruction (if it exists).
   1089     Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
   1090   } else
   1091     Builder.CreateUnreachable();
   1092 
   1093   // Make sure later instructions don't get reordered with a fence if
   1094   // necessary.
   1095   Builder.SetInsertPoint(SuccessBB);
   1096   if (ShouldInsertFencesForAtomic)
   1097     TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
   1098                            /*IsLoad=*/true);
   1099   Builder.CreateBr(ExitBB);
   1100 
   1101   Builder.SetInsertPoint(NoStoreBB);
   1102   // In the failing case, where we don't execute the store-conditional, the
   1103   // target might want to balance out the load-linked with a dedicated
   1104   // instruction (e.g., on ARM, clearing the exclusive monitor).
   1105   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
   1106   Builder.CreateBr(FailureBB);
   1107 
   1108   Builder.SetInsertPoint(FailureBB);
   1109   if (ShouldInsertFencesForAtomic)
   1110     TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
   1111                            /*IsLoad=*/true);
   1112   Builder.CreateBr(ExitBB);
   1113 
   1114   // Finally, we have control-flow based knowledge of whether the cmpxchg
   1115   // succeeded or not. We expose this to later passes by converting any
   1116   // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
   1117   // PHI.
   1118   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   1119   PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
   1120   Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
   1121   Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
   1122 
   1123   // Setup the builder so we can create any PHIs we need.
   1124   Value *Loaded;
   1125   if (!HasReleasedLoadBB)
   1126     Loaded = UnreleasedLoad;
   1127   else {
   1128     Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
   1129     PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
   1130     TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
   1131     TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
   1132 
   1133     Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
   1134     PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
   1135     NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
   1136     NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
   1137 
   1138     Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
   1139     PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
   1140     ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
   1141     ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
   1142 
   1143     Loaded = ExitLoaded;
   1144   }
   1145 
   1146   // Look for any users of the cmpxchg that are just comparing the loaded value
   1147   // against the desired one, and replace them with the CFG-derived version.
   1148   SmallVector<ExtractValueInst *, 2> PrunedInsts;
   1149   for (auto User : CI->users()) {
   1150     ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
   1151     if (!EV)
   1152       continue;
   1153 
   1154     assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
   1155            "weird extraction from { iN, i1 }");
   1156 
   1157     if (EV->getIndices()[0] == 0)
   1158       EV->replaceAllUsesWith(Loaded);
   1159     else
   1160       EV->replaceAllUsesWith(Success);
   1161 
   1162     PrunedInsts.push_back(EV);
   1163   }
   1164 
   1165   // We can remove the instructions now we're no longer iterating through them.
   1166   for (auto EV : PrunedInsts)
   1167     EV->eraseFromParent();
   1168 
   1169   if (!CI->use_empty()) {
   1170     // Some use of the full struct return that we don't understand has happened,
   1171     // so we've got to reconstruct it properly.
   1172     Value *Res;
   1173     Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
   1174     Res = Builder.CreateInsertValue(Res, Success, 1);
   1175 
   1176     CI->replaceAllUsesWith(Res);
   1177   }
   1178 
   1179   CI->eraseFromParent();
   1180   return true;
   1181 }
   1182 
   1183 bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
   1184   auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
   1185   if (!C)
   1186     return false;
   1187 
   1188   AtomicRMWInst::BinOp Op = RMWI->getOperation();
   1189   switch (Op) {
   1190     case AtomicRMWInst::Add:
   1191     case AtomicRMWInst::Sub:
   1192     case AtomicRMWInst::Or:
   1193     case AtomicRMWInst::Xor:
   1194       return C->isZero();
   1195     case AtomicRMWInst::And:
   1196       return C->isMinusOne();
   1197     // FIXME: Min/Max/UMin/UMax could also be treated as idempotent for INT_MAX/INT_MIN/UINT_MAX/0 respectively.
   1198     default:
   1199       return false;
   1200   }
   1201 }
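        // Illustrative examples (assumed, not from the original source) of RMW
        // operations the check above classifies as idempotent:
        //   atomicrmw add i32* %p, i32 0 seq_cst     ; x + 0  == x
        //   atomicrmw or  i32* %p, i32 0 seq_cst     ; x | 0  == x
        //   atomicrmw and i32* %p, i32 -1 seq_cst    ; x & ~0 == x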
   1202 
   1203 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
   1204   if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
   1205     tryExpandAtomicLoad(ResultingLoad);
   1206     return true;
   1207   }
   1208   return false;
   1209 }
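        // A minimal sketch (assumed; the exact shape is target-specific) of what
        // lowerIdempotentRMWIntoFencedLoad may produce for the examples above:
        //   fence seq_cst
        //   %v = load atomic i32, i32* %p monotonic, align 4
        // i.e. the idempotent RMW degenerates into a fence plus an atomic load,
        // which tryExpandAtomicLoad can then expand further if needed.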
   1210 
   1211 Value *AtomicExpand::insertRMWCmpXchgLoop(
   1212     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
   1213     AtomicOrdering MemOpOrder,
   1214     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
   1215     CreateCmpXchgInstFun CreateCmpXchg) {
   1216   LLVMContext &Ctx = Builder.getContext();
   1217   BasicBlock *BB = Builder.GetInsertBlock();
   1218   Function *F = BB->getParent();
   1219 
   1220   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
   1221   //
   1222   // The standard expansion we produce is:
   1223   //     [...]
   1224   //     %init_loaded = load atomic iN* %addr
   1225   //     br label %loop
   1226   // loop:
   1227   //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
   1228   //     %new = some_op iN %loaded, %incr
   1229   //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
   1230   //     %new_loaded = extractvalue { iN, i1 } %pair, 0
   1231   //     %success = extractvalue { iN, i1 } %pair, 1
   1232   //     br i1 %success, label %atomicrmw.end, label %loop
   1233   // atomicrmw.end:
   1234   //     [...]
   1235   BasicBlock *ExitBB =
   1236       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
   1237   BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
   1238 
   1239   // The split call above "helpfully" added a branch at the end of BB (to the
   1240   // wrong place), but we want a load. It's easiest to just remove
   1241   // the branch entirely.
   1242   std::prev(BB->end())->eraseFromParent();
   1243   Builder.SetInsertPoint(BB);
   1244   LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
   1245   // Atomics require at least natural alignment.
   1246   InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
   1247   Builder.CreateBr(LoopBB);
   1248 
   1249   // Start the main loop block now that we've taken care of the preliminaries.
   1250   Builder.SetInsertPoint(LoopBB);
   1251   PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
   1252   Loaded->addIncoming(InitLoaded, BB);
   1253 
   1254   Value *NewVal = PerformOp(Builder, Loaded);
   1255 
   1256   Value *NewLoaded = nullptr;
   1257   Value *Success = nullptr;
   1258 
   1259   CreateCmpXchg(Builder, Addr, Loaded, NewVal,
   1260                 MemOpOrder == AtomicOrdering::Unordered
   1261                     ? AtomicOrdering::Monotonic
   1262                     : MemOpOrder,
   1263                 Success, NewLoaded);
   1264   assert(Success && NewLoaded);
   1265 
   1266   Loaded->addIncoming(NewLoaded, LoopBB);
   1267 
   1268   Builder.CreateCondBr(Success, ExitBB, LoopBB);
   1269 
   1270   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   1271   return NewLoaded;
   1272 }
   1273 
   1274 // Note: This function is exposed externally by AtomicExpandUtils.h
   1275 bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
   1276                                     CreateCmpXchgInstFun CreateCmpXchg) {
   1277   IRBuilder<> Builder(AI);
   1278   Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
   1279       Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
   1280       [&](IRBuilder<> &Builder, Value *Loaded) {
   1281         return performAtomicOp(AI->getOperation(), Builder, Loaded,
   1282                                AI->getValOperand());
   1283       },
   1284       CreateCmpXchg);
   1285 
   1286   AI->replaceAllUsesWith(Loaded);
   1287   AI->eraseFromParent();
   1288   return true;
   1289 }
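        // Illustrative use of the helper above (a sketch, not code from this file):
        // a caller can supply a CreateCmpXchgInstFun that emits a plain cmpxchg
        // instruction, e.g.
        //   expandAtomicRMWToCmpXchg(AI, [](IRBuilder<> &B, Value *Addr,
        //                                   Value *Loaded, Value *NewVal,
        //                                   AtomicOrdering Order, Value *&Success,
        //                                   Value *&NewLoaded) {
        //     auto *Pair = B.CreateAtomicCmpXchg(
        //         Addr, Loaded, NewVal, Order,
        //         AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
        //     Success = B.CreateExtractValue(Pair, 1, "success");
        //     NewLoaded = B.CreateExtractValue(Pair, 0, "newloaded");
        //   });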
   1290 
   1291 // In order to use one of the sized library calls such as
   1292 // __atomic_fetch_add_4, the alignment must be sufficient, the size
   1293 // must be one of the potentially-specialized sizes, and the value
   1294 // type must actually exist in C on the target (otherwise, the
   1295 // function wouldn't actually be defined).
   1296 static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
   1297                                   const DataLayout &DL) {
   1298   // TODO: "LargestSize" is an approximation for "largest type that
   1299   // you can express in C". It seems to be the case that int128 is
   1300 // supported on all 64-bit platforms; otherwise only up to 64-bit
   1301   // integers are supported. If we get this wrong, then we'll try to
   1302   // call a sized libcall that doesn't actually exist. There should
   1303   // really be some more reliable way in LLVM of determining integer
   1304   // sizes which are valid in the target's C ABI...
   1305   unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
   1306   return Align >= Size &&
   1307          (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
   1308          Size <= LargestSize;
   1309 }
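        // Illustrative results of the check above (assumed examples), on a target
        // whose largest legal integer type is narrower than 64 bits (LargestSize == 8):
        //   canUseSizedAtomicCall(4,  /*Align=*/4,  DL) -> true   (__atomic_*_4 usable)
        //   canUseSizedAtomicCall(4,  /*Align=*/2,  DL) -> false  (under-aligned)
        //   canUseSizedAtomicCall(16, /*Align=*/16, DL) -> false  (no 128-bit C type)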
   1310 
   1311 void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
   1312   static const RTLIB::Libcall Libcalls[6] = {
   1313       RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
   1314       RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
   1315   unsigned Size = getAtomicOpSize(I);
   1316   unsigned Align = getAtomicOpAlign(I);
   1317 
   1318   bool expanded = expandAtomicOpToLibcall(
   1319       I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
   1320       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   1321   (void)expanded;
   1322   assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
   1323 }
   1324 
   1325 void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
   1326   static const RTLIB::Libcall Libcalls[6] = {
   1327       RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
   1328       RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
   1329   unsigned Size = getAtomicOpSize(I);
   1330   unsigned Align = getAtomicOpAlign(I);
   1331 
   1332   bool expanded = expandAtomicOpToLibcall(
   1333       I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
   1334       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   1335   (void)expanded;
   1336   assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
   1337 }
   1338 
   1339 void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
   1340   static const RTLIB::Libcall Libcalls[6] = {
   1341       RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
   1342       RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
   1343       RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
   1344   unsigned Size = getAtomicOpSize(I);
   1345   unsigned Align = getAtomicOpAlign(I);
   1346 
   1347   bool expanded = expandAtomicOpToLibcall(
   1348       I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
   1349       I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
   1350       Libcalls);
   1351   (void)expanded;
   1352   assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
   1353 }
   1354 
   1355 static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
   1356   static const RTLIB::Libcall LibcallsXchg[6] = {
   1357       RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
   1358       RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
   1359       RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
   1360   static const RTLIB::Libcall LibcallsAdd[6] = {
   1361       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
   1362       RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
   1363       RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
   1364   static const RTLIB::Libcall LibcallsSub[6] = {
   1365       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
   1366       RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
   1367       RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
   1368   static const RTLIB::Libcall LibcallsAnd[6] = {
   1369       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
   1370       RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
   1371       RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
   1372   static const RTLIB::Libcall LibcallsOr[6] = {
   1373       RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
   1374       RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
   1375       RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
   1376   static const RTLIB::Libcall LibcallsXor[6] = {
   1377       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
   1378       RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
   1379       RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
   1380   static const RTLIB::Libcall LibcallsNand[6] = {
   1381       RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
   1382       RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
   1383       RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
   1384 
   1385   switch (Op) {
   1386   case AtomicRMWInst::BAD_BINOP:
   1387     llvm_unreachable("Should not have BAD_BINOP.");
   1388   case AtomicRMWInst::Xchg:
   1389     return makeArrayRef(LibcallsXchg);
   1390   case AtomicRMWInst::Add:
   1391     return makeArrayRef(LibcallsAdd);
   1392   case AtomicRMWInst::Sub:
   1393     return makeArrayRef(LibcallsSub);
   1394   case AtomicRMWInst::And:
   1395     return makeArrayRef(LibcallsAnd);
   1396   case AtomicRMWInst::Or:
   1397     return makeArrayRef(LibcallsOr);
   1398   case AtomicRMWInst::Xor:
   1399     return makeArrayRef(LibcallsXor);
   1400   case AtomicRMWInst::Nand:
   1401     return makeArrayRef(LibcallsNand);
   1402   case AtomicRMWInst::Max:
   1403   case AtomicRMWInst::Min:
   1404   case AtomicRMWInst::UMax:
   1405   case AtomicRMWInst::UMin:
   1406     // No atomic libcalls are available for max/min/umax/umin.
   1407     return {};
   1408   }
   1409   llvm_unreachable("Unexpected AtomicRMW operation.");
   1410 }
   1411 
   1412 void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
   1413   ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
   1414 
   1415   unsigned Size = getAtomicOpSize(I);
   1416   unsigned Align = getAtomicOpAlign(I);
   1417 
   1418   bool Success = false;
   1419   if (!Libcalls.empty())
   1420     Success = expandAtomicOpToLibcall(
   1421         I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
   1422         I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   1423 
   1424   // The expansion failed: either there were no libcalls at all for
   1425   // the operation (min/max), or there were only size-specialized
   1426   // libcalls (add/sub/etc) and we needed a generic one. So instead, expand
   1427   // to a CAS loop that uses the CAS libcall.
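          // For example (an assumed case): 'atomicrmw max' has no __atomic_fetch_*
          // libcall at all, so it is rewritten into a loop whose cmpxchg is in turn
          // lowered to an __atomic_compare_exchange* libcall.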
   1428   if (!Success) {
   1429     expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
   1430                                        Value *Loaded, Value *NewVal,
   1431                                        AtomicOrdering MemOpOrder,
   1432                                        Value *&Success, Value *&NewLoaded) {
   1433       // Create the CAS instruction normally...
   1434       AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
   1435           Addr, Loaded, NewVal, MemOpOrder,
   1436           AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
   1437       Success = Builder.CreateExtractValue(Pair, 1, "success");
   1438       NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
   1439 
   1440       // ...and then expand the CAS into a libcall.
   1441       expandAtomicCASToLibcall(Pair);
   1442     });
   1443   }
   1444 }
   1445 
   1446 // A helper routine for the above expandAtomic*ToLibcall functions.
   1447 //
   1448 // 'Libcalls' contains an array of enum values for the particular
   1449 // ATOMIC libcalls to be emitted. All of the other arguments besides
   1450 // 'I' are extracted from the Instruction subclass by the
   1451 // caller. Depending on the particular call, some will be null.
   1452 bool AtomicExpand::expandAtomicOpToLibcall(
   1453     Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
   1454     Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
   1455     AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
   1456   assert(Libcalls.size() == 6);
   1457 
   1458   LLVMContext &Ctx = I->getContext();
   1459   Module *M = I->getModule();
   1460   const DataLayout &DL = M->getDataLayout();
   1461   IRBuilder<> Builder(I);
   1462   IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
   1463 
   1464   bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
   1465   Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
   1466 
   1467   unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
   1468 
   1469   // TODO: the "order" argument type is "int", not int32. So
   1470   // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
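          // For reference (assumed to match the usual __ATOMIC_* numbering produced
          // by toCABI): relaxed = 0, consume = 1, acquire = 2, release = 3,
          // acq_rel = 4, seq_cst = 5.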
   1471   ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
   1472   assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
   1473   Constant *OrderingVal =
   1474       ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
   1475   Constant *Ordering2Val = nullptr;
   1476   if (CASExpected) {
   1477     assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
   1478     Ordering2Val =
   1479         ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
   1480   }
   1481   bool HasResult = I->getType() != Type::getVoidTy(Ctx);
   1482 
   1483   RTLIB::Libcall RTLibType;
   1484   if (UseSizedLibcall) {
   1485     switch (Size) {
   1486     case 1: RTLibType = Libcalls[1]; break;
   1487     case 2: RTLibType = Libcalls[2]; break;
   1488     case 4: RTLibType = Libcalls[3]; break;
   1489     case 8: RTLibType = Libcalls[4]; break;
   1490     case 16: RTLibType = Libcalls[5]; break;
   1491     }
   1492   } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
   1493     RTLibType = Libcalls[0];
   1494   } else {
   1495     // Can't use sized function, and there's no generic for this
   1496     // operation, so give up.
   1497     return false;
   1498   }
   1499 
   1500   // Build up the function call. There are two kinds. First, the sized
   1501   // variants.  These calls are going to be one of the following (with
   1502   // N=1,2,4,8,16):
   1503   //  iN    __atomic_load_N(iN *ptr, int ordering)
   1504   //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
   1505   //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
   1506   //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
   1507   //                                    int success_order, int failure_order)
   1508   //
   1509   // Note that these functions can be used for non-integer atomic
   1510   // operations; the values just need to be bitcast to integers on the
   1511   // way in and out.
   1512   //
   1513   // And, then, the generic variants. They look like the following:
   1514   //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
   1515   //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
   1516   //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
   1517   //                          int ordering)
   1518   //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
   1519   //                                  void *desired, int success_order,
   1520   //                                  int failure_order)
   1521   //
   1522   // The different signatures are built up depending on the
   1523   // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
   1524   // variables.
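          // For example (a sketch with assumed names): a naturally aligned
          //   %old = atomicrmw xchg i32* %p, i32 %v seq_cst
          // becomes the sized variant
          //   %old = call i32 @__atomic_exchange_4(i8* %p.i8, i32 %v, i32 5)
          // whereas an under-aligned i32 exchange takes the generic path and passes
          // the value and result through stack temporaries.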
   1525 
   1526   AllocaInst *AllocaCASExpected = nullptr;
   1527   Value *AllocaCASExpected_i8 = nullptr;
   1528   AllocaInst *AllocaValue = nullptr;
   1529   Value *AllocaValue_i8 = nullptr;
   1530   AllocaInst *AllocaResult = nullptr;
   1531   Value *AllocaResult_i8 = nullptr;
   1532 
   1533   Type *ResultTy;
   1534   SmallVector<Value *, 6> Args;
   1535   AttributeSet Attr;
   1536 
   1537   // 'size' argument.
   1538   if (!UseSizedLibcall) {
   1539     // Note, getIntPtrType is assumed equivalent to size_t.
   1540     Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
   1541   }
   1542 
   1543   // 'ptr' argument.
   1544   Value *PtrVal =
   1545       Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
   1546   Args.push_back(PtrVal);
   1547 
   1548   // 'expected' argument, if present.
   1549   if (CASExpected) {
   1550     AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
   1551     AllocaCASExpected->setAlignment(AllocaAlignment);
   1552     AllocaCASExpected_i8 =
   1553         Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
   1554     Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
   1555     Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
   1556     Args.push_back(AllocaCASExpected_i8);
   1557   }
   1558 
   1559   // 'val' argument ('desired' for cas), if present.
   1560   if (ValueOperand) {
   1561     if (UseSizedLibcall) {
   1562       Value *IntValue =
   1563           Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
   1564       Args.push_back(IntValue);
   1565     } else {
   1566       AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
   1567       AllocaValue->setAlignment(AllocaAlignment);
   1568       AllocaValue_i8 =
   1569           Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
   1570       Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
   1571       Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
   1572       Args.push_back(AllocaValue_i8);
   1573     }
   1574   }
   1575 
   1576   // 'ret' argument.
   1577   if (!CASExpected && HasResult && !UseSizedLibcall) {
   1578     AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
   1579     AllocaResult->setAlignment(AllocaAlignment);
   1580     AllocaResult_i8 =
   1581         Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
   1582     Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
   1583     Args.push_back(AllocaResult_i8);
   1584   }
   1585 
   1586   // 'ordering' ('success_order' for cas) argument.
   1587   Args.push_back(OrderingVal);
   1588 
   1589   // 'failure_order' argument, if present.
   1590   if (Ordering2Val)
   1591     Args.push_back(Ordering2Val);
   1592 
   1593   // Now, the return type.
   1594   if (CASExpected) {
   1595     ResultTy = Type::getInt1Ty(Ctx);
   1596     Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
   1597   } else if (HasResult && UseSizedLibcall)
   1598     ResultTy = SizedIntTy;
   1599   else
   1600     ResultTy = Type::getVoidTy(Ctx);
   1601 
   1602   // Done with setting up arguments and return types; create the call:
   1603   SmallVector<Type *, 6> ArgTys;
   1604   for (Value *Arg : Args)
   1605     ArgTys.push_back(Arg->getType());
   1606   FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
   1607   Constant *LibcallFn =
   1608       M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
   1609   CallInst *Call = Builder.CreateCall(LibcallFn, Args);
   1610   Call->setAttributes(Attr);
   1611   Value *Result = Call;
   1612 
   1613   // And then, extract the results...
   1614   if (ValueOperand && !UseSizedLibcall)
   1615     Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
   1616 
   1617   if (CASExpected) {
   1618     // The final result from the CAS is {load of 'expected' alloca, bool result
   1619     // from call}
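            // A sketch of that reconstruction (assumed example, generic path on a
            // 64-bit target):
            //   %ok  = call zeroext i1 @__atomic_compare_exchange(i64 4, i8* %p.i8,
            //                 i8* %exp.i8, i8* %des.i8, i32 5, i32 5)
            //   %old = load i32, i32* %expected.alloca
            //   %res0 = insertvalue { i32, i1 } undef, i32 %old, 0
            //   %res  = insertvalue { i32, i1 } %res0, i1 %ok, 1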
   1620     Type *FinalResultTy = I->getType();
   1621     Value *V = UndefValue::get(FinalResultTy);
   1622     Value *ExpectedOut =
   1623         Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
   1624     Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
   1625     V = Builder.CreateInsertValue(V, ExpectedOut, 0);
   1626     V = Builder.CreateInsertValue(V, Result, 1);
   1627     I->replaceAllUsesWith(V);
   1628   } else if (HasResult) {
   1629     Value *V;
   1630     if (UseSizedLibcall)
   1631       V = Builder.CreateBitOrPointerCast(Result, I->getType());
   1632     else {
   1633       V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
   1634       Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
   1635     }
   1636     I->replaceAllUsesWith(V);
   1637   }
   1638   I->eraseFromParent();
   1639   return true;
   1640 }
   1641