//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// target-specific instructions which implement the same semantics in a way
// which better fits the target backend.  This can include the use of
// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
// type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand : public FunctionPass {
    const TargetMachine *TM;
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit AtomicExpand(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicOpToLLSC(
        Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
        std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
    "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
    false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
    if (I->isAtomic())
      AtomicInsts.push_back(&*I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) &&
           "Unknown atomic instruction");

    auto FenceOrdering = Monotonic;
    bool IsStore, IsLoad;
    if (TLI->getInsertFencesForAtomic()) {
      if (LI && isAtLeastAcquire(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isAtLeastRelease(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) ||
                          isAtLeastAcquire(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isAtLeastRelease(CASI->getSuccessOrdering()) ||
                  isAtLeastAcquire(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(Monotonic);
        CASI->setFailureOrdering(Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // We try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI)) {
      MadeChange |= expandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting the Builder's insertion point
  // after an instruction. So we must erase it from the BB and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth.  See the function comment on
/// convertAtomicStoreToIntegerType for background.
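/// For example, an atomic load of float is rewritten as an atomic load of i32
/// through a bitcast pointer, with the result bitcast back to float.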
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

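/// Expand an atomic load using the strategy chosen by the target's
/// shouldExpandAtomicLoadInIR hook: leave it alone, wrap it in an LL/SC loop,
/// emit a bare load-linked, or lower it to a cmpxchg.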
bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(
        LI, LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

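/// Expand an atomic load by issuing a cmpxchg of a dummy (zero) value against
/// itself; whether or not the exchange "succeeds", the loaded member of the
/// result is the current value in memory.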
bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth.  We used to not support floating point or vector
/// atomics in the IR at all.  The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store.  The long-term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
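/// For example, an atomic store of float is rewritten as a bitcast of the
/// value to i32 followed by an atomic store of that i32 through a bitcast
/// pointer.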
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them with an
  // atomic swap, which can be implemented for example as ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicStoreInIR in cases where this is required and possible.
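  // For example, an oversized seq_cst atomic store becomes a seq_cst
  // "atomicrmw xchg" of the stored value whose result is simply discarded.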
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now that we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

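/// Emit a cmpxchg of NewVal against Loaded at Addr with the given memory
/// ordering, returning (through the out-parameters) the i1 success flag and
/// the value actually loaded.  This is the CreateCmpXchgInstFun that
/// tryExpandAtomicRMW hands to expandAtomicRMWToCmpXchg.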
static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
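/// For example, Max is lowered to an "icmp sgt" of the loaded value against
/// the increment followed by a select of the larger of the two.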
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
                                [&](IRBuilder<> &Builder, Value *Loaded) {
                                  return performAtomicOp(AI->getOperation(),
                                                         Builder, Loaded,
                                                         AI->getValOperand());
                                });
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

bool AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     fence?
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp ne i32 %stored, 0
  //     br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
  // atomicrmw.end:
  //     fence?
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from I.
  IRBuilder<> Builder(I);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If getInsertFencesForAtomic() returns true, then the target does not want
  // to deal with memory orders, and emitLeading/TrailingFence should take care
  // of everything. Otherwise, emitLeading/TrailingFence are no-ops and we
  // should preserve the ordering.
  AtomicOrdering MemOpOrder =
      TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder;

  // Given: cmpxchg iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  //     fence?
  // cmpxchg.start:
  //     %loaded = @load.linked(%addr)
  //     %should_store = icmp eq %loaded, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.nostore
  // cmpxchg.trystore:
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.start/%cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
  auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);

  // This grabs the DebugLoc from CI.
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                        /*IsLoad=*/true);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore =
      Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : LoopBB);

  // Make sure later instructions don't get reordered with a fence if necessary.
  Builder.SetInsertPoint(SuccessBB);
  TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                         /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                         /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.

  // Set up the builder so we can create any PHIs we need.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now that we're no longer iterating through
  // them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has happened,
    // so we have to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

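/// Return true if applying the given RMW operation with its constant operand
/// leaves the value in memory unchanged, e.g. an add of 0 or an and of -1.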
bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Xor:
      return C->isZero();
    case AtomicRMWInst::And:
      return C->isMinusOne();
    // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
    default:
      return false;
  }
}

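/// Give the target a chance to lower an idempotent RMW (see above) into a
/// fenced load; if it does, run the resulting load through the usual
/// atomic-load expansion as well.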
bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

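/// Expand the given atomicrmw into a load followed by a compare-exchange
/// loop, using CreateCmpXchg (e.g. createCmpXchgInstFun above) to emit the
/// cmpxchg in whatever form the caller prefers.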
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  assert(AI);

  AtomicOrdering MemOpOrder =
      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  AI->replaceAllUsesWith(NewLoaded);
  AI->eraseFromParent();

  return true;
}