//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target-specific instructions which implement
// the same semantics in a way which better fits the target backend. This can
// include the use of (intrinsic-based) load-linked/store-conditional loops,
// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand: public FunctionPass {
    const TargetMachine *TM;
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit AtomicExpand(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    Value *
    insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
                      AtomicOrdering MemOpOrder,
                      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
    void expandAtomicOpToLLSC(
        Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
        function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
    void expandPartwordAtomicRMW(
        AtomicRMWInst *I,
        TargetLoweringBase::AtomicExpansionKind ExpansionKind);
    void expandPartwordCmpXchg(AtomicCmpXchgInst *I);

    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
    static Value *insertRMWCmpXchgLoop(
        IRBuilder<> &Builder, Type *ResultType, Value *Addr,
        AtomicOrdering MemOpOrder,
        function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
        CreateCmpXchgInstFun CreateCmpXchg);

    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);

    bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
                                 Value *PointerOperand, Value *ValueOperand,
                                 Value *CASExpected, AtomicOrdering Ordering,
                                 AtomicOrdering Ordering2,
                                 ArrayRef<RTLIB::Libcall> Libcalls);
    void expandAtomicLoadToLibcall(LoadInst *LI);
    void expandAtomicStoreToLibcall(StoreInst *LI);
    void expandAtomicRMWToLibcall(AtomicRMWInst *I);
    void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);

    friend bool
    llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                   CreateCmpXchgInstFun CreateCmpXchg);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
                   false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

namespace {
// Helper functions to retrieve the size of atomic instructions.
unsigned getAtomicOpSize(LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(LI->getType());
}

unsigned getAtomicOpSize(StoreInst *SI) {
  const DataLayout &DL = SI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(SI->getValueOperand()->getType());
}

unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Helper functions to retrieve the alignment of atomic instructions.
unsigned getAtomicOpAlign(LoadInst *LI) {
  unsigned Align = LI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(StoreInst *SI) {
  unsigned Align = SI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
  // TODO(PR27168): This instruction has no alignment attribute, but unlike the
  // default alignment for load/store, the default here is to assume
  // it has NATURAL alignment, not DataLayout-specified alignment.
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
  // TODO(PR27168): same comment as above.
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall)
template <typename Inst>
bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}

} // end anonymous namespace
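
// Illustrative note (not from the original source): atomicSizeSupported is
// what routes an operation either to target lowering or to the __atomic_*
// libcalls. For example, assuming a target whose
// getMaxAtomicSizeInBitsSupported() returns 64, a 16-byte cmpxchg, or an
// 8-byte load that is only 4-byte aligned, fails the check and is turned
// into an __atomic_* library call by runOnFunction below.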

bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather
  // a list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    // If the Size/Alignment is not supported, replace with a libcall.
    if (LI) {
      if (!atomicSizeSupported(TLI, LI)) {
        expandAtomicLoadToLibcall(LI);
        MadeChange = true;
        continue;
      }
    } else if (SI) {
      if (!atomicSizeSupported(TLI, SI)) {
        expandAtomicStoreToLibcall(SI);
        MadeChange = true;
        continue;
      }
    } else if (RMWI) {
      if (!atomicSizeSupported(TLI, RMWI)) {
        expandAtomicRMWToLibcall(RMWI);
        MadeChange = true;
        continue;
      }
    } else if (CASI) {
      if (!atomicSizeSupported(TLI, CASI)) {
        expandAtomicCASToLibcall(CASI);
        MadeChange = true;
        continue;
      }
    }

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      bool IsStore, IsLoad;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
      unsigned ValueSize = getAtomicOpSize(CASI);
      if (ValueSize < MinCASSize) {
        assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
               "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
        expandPartwordCmpXchg(CASI);
      } else {
        if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
          MadeChange |= expandAtomicCmpXchg(CASI);
      }
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting Builder insertion point after
  // an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}
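
// Illustrative note (not from the original source): for a target that
// reports shouldInsertFencesForAtomic(), runOnFunction first downgrades the
// access to monotonic and bracketInstWithFences then re-establishes the
// ordering around it, conceptually turning, e.g.,
//   store atomic i32 %v, i32* %p seq_cst
// into
//   <leading fence>   ; whatever TLI->emitLeadingFence produces
//   store atomic i32 %v, i32* %p monotonic
//   <trailing fence>  ; whatever TLI->emitTrailingFence produces
// The exact fence instructions (if any) are entirely target-defined.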

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    expandAtomicOpToLLSC(
        LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
    return true;
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}
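
// Illustrative note (not from the original source): for an i32 acquire load,
// the CmpXChg expansion above produces IR along the lines of
//   %pair = cmpxchg i32* %addr, i32 0, i32 0 acquire acquire
//   %loaded = extractvalue { i32, i1 } %pair, 0
// i.e. a compare-exchange with a dummy zero value whose only useful effect is
// reading the current contents atomically.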

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}
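
// Illustrative note (not from the original source): the bitcast idiom above
// rewrites, e.g.,
//   store atomic float %f, float* %p seq_cst, align 4
// into
//   %1 = bitcast float %f to i32
//   %2 = bitcast float* %p to i32*
//   store atomic i32 %1, i32* %2 seq_cst, align 4
// convertAtomicLoadToIntegerType performs the mirror-image rewrite for loads.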

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, that can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}
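
// Illustrative note (not from the original source): for atomicrmw max, the
// register-level operation emitted above is
//   %0 = icmp sgt i32 %loaded, %inc
//   %new = select i1 %0, i32 %loaded, i32 %inc
// which the surrounding LL/SC or cmpxchg loop then publishes back to memory.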

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      llvm_unreachable(
          "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
    } else {
      auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      };
      expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
                           AI->getOrdering(), PerformOp);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      expandPartwordAtomicRMW(AI,
                              TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    } else {
      expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
    }
    return true;
  }
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

namespace {

/// Result values from createMaskInstrs helper.
struct PartwordMaskValues {
  Type *WordType;
  Type *ValueType;
  Value *AlignedAddr;
  Value *ShiftAmt;
  Value *Mask;
  Value *Inv_Mask;
};
} // end anonymous namespace

/// This is a helper function which builds instructions to provide
/// values necessary for partword atomic operations. It takes an
/// incoming address, Addr, and ValueType, and constructs the address,
/// shift-amounts and masks needed to work with a larger value of size
/// WordSize.
///
/// AlignedAddr: Addr rounded down to a multiple of WordSize
///
/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
///           from AlignAddr for it to have the same value as if
///           ValueType was loaded from Addr.
///
/// Mask: Value to mask with the value loaded from AlignAddr to
///       include only the part that would've been loaded from Addr.
///
/// Inv_Mask: The inverse of Mask.
static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
                                           Type *ValueType, Value *Addr,
                                           unsigned WordSize) {
  PartwordMaskValues Ret;

  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  Module *M = I->getModule();

  LLVMContext &Ctx = F->getContext();
  const DataLayout &DL = M->getDataLayout();

  unsigned ValueSize = DL.getTypeStoreSize(ValueType);

  assert(ValueSize < WordSize);

  Ret.ValueType = ValueType;
  Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);

  Type *WordPtrType =
      Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());

  Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
  Ret.AlignedAddr = Builder.CreateIntToPtr(
      Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
      "AlignedAddr");

  Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
  if (DL.isLittleEndian()) {
    // turn bytes into bits
    Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
  } else {
    // turn bytes into bits, and count from the other side.
    Ret.ShiftAmt =
        Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
  }

  Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
  Ret.Mask = Builder.CreateShl(
      ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
      "Mask");
  Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");

  return Ret;
}
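
// Worked example (illustrative, not from the original source): an i8 operand
// at an address A with A % 4 == 3 and WordSize == 4 on a little-endian target
// yields
//   AlignedAddr = A & ~3
//   PtrLSB      = 3
//   ShiftAmt    = 24
//   Mask        = 0xFF000000
//   Inv_Mask    = 0x00FFFFFF
// On a big-endian target the same byte sits at ShiftAmt = (3 ^ (4 - 1)) * 8 = 0.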

/// Emit IR to implement a masked version of a given atomicrmw
/// operation. (That is, only the bits under the Mask should be
/// affected by the operation)
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
                                    IRBuilder<> &Builder, Value *Loaded,
                                    Value *Shifted_Inc, Value *Inc,
                                    const PartwordMaskValues &PMV) {
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
    return FinalVal;
  }
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    // Or/Xor won't affect any other bits, so can just be done
    // directly.
    return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand: {
    // The other arithmetic ops need to be masked into place.
    Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
    return FinalVal;
  }
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin: {
    // Finally, comparison ops will operate on the full value, so
    // truncate down to the original size, and expand out again after
    // doing the operation.
    Value *Loaded_Shiftdown = Builder.CreateTrunc(
        Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
    Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
    Value *NewVal_Shiftup = Builder.CreateShl(
        Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
    return FinalVal;
  }
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

/// Expand a sub-word atomicrmw operation into an appropriate
/// word-sized operation.
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
/// that the operation inside of the loop must operate only upon a
/// part of the value.
void AtomicExpand::expandPartwordAtomicRMW(
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {

  assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);

  AtomicOrdering MemOpOrder = AI->getOrdering();

  IRBuilder<> Builder(AI);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
                       TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
                        PMV.ShiftAmt, "ValOperand_Shifted");

  auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
                                 ValOperand_Shifted, AI->getValOperand(), PMV);
  };

  // TODO: When we're ready to support LLSC conversions too, use
  // insertRMWLLSCLoop here for ExpansionKind==LLSC.
  Value *OldResult =
      insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
                           PerformPartwordOp, createCmpXchgInstFun);
  Value *FinalOldResult = Builder.CreateTrunc(
      Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
  AI->replaceAllUsesWith(FinalOldResult);
  AI->eraseFromParent();
}

void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
  // The basic idea here is that we're expanding a cmpxchg of a
  // smaller memory size up to a word-sized cmpxchg. To do this, we
  // need to add a retry-loop for strong cmpxchg, so that
  // modifications to other parts of the word don't cause a spurious
  // failure.

  // This generates code like the following:
  //     [[Setup mask values PMV.*]]
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
  //     %InitLoaded = load i32* %addr
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
  //     br partword.cmpxchg.loop
  // partword.cmpxchg.loop:
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
  //        i32 %FullWord_NewVal success_ordering failure_ordering
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
  //     br i1 %Success, label %partword.cmpxchg.end,
  //        label %partword.cmpxchg.failure
  // partword.cmpxchg.failure:
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
  //        label %partword.cmpxchg.end
  // partword.cmpxchg.end:
  //     %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
  //     %FinalOldVal = trunc i32 %tmp1 to i8
  //     %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
  //     %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1

  Value *Addr = CI->getPointerOperand();
  Value *Cmp = CI->getCompareOperand();
  Value *NewVal = CI->getNewValOperand();

  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  IRBuilder<> Builder(CI);
  LLVMContext &Ctx = Builder.getContext();

  const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;

  BasicBlock *EndBB =
      BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
  auto FailureBB =
      BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
  auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);

  // The split call above "helpfully" added a branch at the end of BB
  // (to the wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);

  // Shift the incoming values over, into the right location in the word.
  Value *NewVal_Shifted =
      Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
  Value *Cmp_Shifted =
      Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);

  // Load the entire current word, and mask into place the expected and new
  // values
  LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
  InitLoaded->setVolatile(CI->isVolatile());
  Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
  Builder.CreateBr(LoopBB);

  // partword.cmpxchg.loop:
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
  Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);

  // Mask/Or the expected and new values into place in the loaded word.
  Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
  Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
      PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
      CI->getFailureOrdering(), CI->getSynchScope());
  NewCI->setVolatile(CI->isVolatile());
  // When we're building a strong cmpxchg, we need a loop, so you
  // might think we could use a weak cmpxchg inside. But, using strong
  // allows the below comparison for ShouldContinue, and we're
  // expecting the underlying cmpxchg to be a machine instruction,
  // which is strong anyways.
  NewCI->setWeak(CI->isWeak());

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Success = Builder.CreateExtractValue(NewCI, 1);

  if (CI->isWeak())
    Builder.CreateBr(EndBB);
  else
    Builder.CreateCondBr(Success, EndBB, FailureBB);

  // partword.cmpxchg.failure:
  Builder.SetInsertPoint(FailureBB);
  // Upon failure, verify that the masked-out part of the loaded value
  // has been modified. If it hasn't, abort the cmpxchg, since the
  // masked-in part must've.
  Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
  Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
  Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);

  // Add the second value to the phi from above
  Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);

  // partword.cmpxchg.end:
  Builder.SetInsertPoint(CI);

  Value *FinalOldVal = Builder.CreateTrunc(
      Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
  Res = Builder.CreateInsertValue(Res, Success, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
}

void AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  IRBuilder<> Builder(I);
  Value *Loaded =
      insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();
}

Value *AtomicExpand::insertRMWLLSCLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return Loaded;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSynchScope());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}
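
// Illustrative note (not from the original source): on a 64-bit target this
// rewrites, e.g.,
//   cmpxchg i8** %p, i8* %old, i8* %new seq_cst seq_cst
// into ptrtoint casts of %old/%new to i64, a bitcast of %p to i64*, an i64
// cmpxchg, and an inttoptr on the extracted old value, matching the
// pre-pointer-cmpxchg form that backends already understand.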

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we
  // can delay emitting this barrier until we know a store is actually going to
  // be attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.releasingstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Setup the builder so we can create any PHIs we need.
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now we're no longer iterating through them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
  default:
    return false;
  }
}
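
// Illustrative note (not from the original source): `atomicrmw or i32* %p,
// i32 0 seq_cst` and `atomicrmw and i32* %p, i32 -1 seq_cst` are idempotent:
// they never change memory, so (if the target agrees) they can be lowered to
// a suitably fenced atomic load by lowerIdempotentRMWIntoFencedLoad below.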

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

Value *AtomicExpand::insertRMWCmpXchgLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return NewLoaded;
}

// Note: This function is exposed externally by AtomicExpandUtils.h
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  IRBuilder<> Builder(AI);
  Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
      Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
      [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      },
      CreateCmpXchg);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return true;
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
  return Align >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}
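
// Illustrative note (not from the original source): with Size = 4 and
// Align >= 4 this permits a sized call such as __atomic_fetch_add_4, while a
// 16-byte operation on a target whose largest legal C integer is 64 bits
// (LargestSize == 8) falls back to the generic, length-taking __atomic_*
// entry points emitted by expandAtomicOpToLibcall.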

void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
}

static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // No atomic libcalls are available for max/min/umax/umin.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
        I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    // canUseSizedAtomicCall only returns true for the sizes handled below.
    switch (Size) {
    case 1: RTLibType = Libcalls[1]; break;
    case 2: RTLibType = Libcalls[2]; break;
    case 4: RTLibType = Libcalls[3]; break;
    case 8: RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use a sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN __atomic_load_N(iN *ptr, int ordering)
  //  void __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                   int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations, the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                         int ordering)
  //  bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                 void *desired, int success_order,
  //                                 int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
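  //
  // As a hedged example of the generic path (the value names here are
  // purely illustrative), a cmpxchg whose type cannot use a sized call is
  // lowered by the code below to roughly
  //   %expected.addr = alloca <ty>   ; in the entry block
  //   %desired.addr  = alloca <ty>
  //   ... stores of the expected and desired values into those slots ...
  //   %ok = call zeroext i1 @__atomic_compare_exchange(
  //             <size_t> <size>, i8* %ptr.cast, i8* %expected.cast,
  //             i8* %desired.cast, i32 <success_order>, i32 <failure_order>)
  // and the instruction's {<ty>, i1} result is reassembled further down
  // from a reload of %expected.addr plus the returned i1.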

  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeSet Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  Value *PtrVal =
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    AllocaCASExpected_i8 =
        Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  Constant *LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
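  // (Illustrative note: in the non-CAS case, a sized call returns the value
  // directly and it is bit- or pointer-cast back to the original type,
  // while a generic call's result is instead reloaded from the 'ret' alloca
  // created above, whose lifetime is ended here.)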
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut =
        Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}