1 //===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/MC/MCObjectDisassembler.h" 11 #include "llvm/ADT/SetVector.h" 12 #include "llvm/ADT/SmallPtrSet.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/StringRef.h" 15 #include "llvm/ADT/Twine.h" 16 #include "llvm/MC/MCAnalysis/MCAtom.h" 17 #include "llvm/MC/MCAnalysis/MCFunction.h" 18 #include "llvm/MC/MCAnalysis/MCModule.h" 19 #include "llvm/MC/MCDisassembler.h" 20 #include "llvm/MC/MCInstrAnalysis.h" 21 #include "llvm/MC/MCObjectSymbolizer.h" 22 #include "llvm/Object/MachO.h" 23 #include "llvm/Object/ObjectFile.h" 24 #include "llvm/Support/Debug.h" 25 #include "llvm/Support/MachO.h" 26 #include "llvm/Support/MemoryObject.h" 27 #include "llvm/Support/StringRefMemoryObject.h" 28 #include "llvm/Support/raw_ostream.h" 29 #include <map> 30 31 using namespace llvm; 32 using namespace object; 33 34 #define DEBUG_TYPE "mc" 35 36 MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj, 37 const MCDisassembler &Dis, 38 const MCInstrAnalysis &MIA) 39 : Obj(Obj), Dis(Dis), MIA(MIA), MOS(nullptr) {} 40 41 uint64_t MCObjectDisassembler::getEntrypoint() { 42 for (const SymbolRef &Symbol : Obj.symbols()) { 43 StringRef Name; 44 Symbol.getName(Name); 45 if (Name == "main" || Name == "_main") { 46 uint64_t Entrypoint; 47 Symbol.getAddress(Entrypoint); 48 return getEffectiveLoadAddr(Entrypoint); 49 } 50 } 51 return 0; 52 } 53 54 ArrayRef<uint64_t> MCObjectDisassembler::getStaticInitFunctions() { 55 return ArrayRef<uint64_t>(); 56 } 57 58 ArrayRef<uint64_t> MCObjectDisassembler::getStaticExitFunctions() { 59 return ArrayRef<uint64_t>(); 60 } 61 62 MemoryObject *MCObjectDisassembler::getRegionFor(uint64_t Addr) { 63 // FIXME: Keep track of object sections. 64 return FallbackRegion.get(); 65 } 66 67 uint64_t MCObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) { 68 return Addr; 69 } 70 71 uint64_t MCObjectDisassembler::getOriginalLoadAddr(uint64_t Addr) { 72 return Addr; 73 } 74 75 MCModule *MCObjectDisassembler::buildEmptyModule() { 76 MCModule *Module = new MCModule; 77 Module->Entrypoint = getEntrypoint(); 78 return Module; 79 } 80 81 MCModule *MCObjectDisassembler::buildModule(bool withCFG) { 82 MCModule *Module = buildEmptyModule(); 83 84 buildSectionAtoms(Module); 85 if (withCFG) 86 buildCFG(Module); 87 return Module; 88 } 89 90 void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) { 91 for (const SectionRef &Section : Obj.sections()) { 92 bool isText; 93 Section.isText(isText); 94 bool isData; 95 Section.isData(isData); 96 if (!isData && !isText) 97 continue; 98 99 uint64_t StartAddr; 100 Section.getAddress(StartAddr); 101 uint64_t SecSize; 102 Section.getSize(SecSize); 103 if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize) 104 continue; 105 StartAddr = getEffectiveLoadAddr(StartAddr); 106 107 StringRef Contents; 108 Section.getContents(Contents); 109 StringRefMemoryObject memoryObject(Contents, StartAddr); 110 111 // We don't care about things like non-file-backed sections yet. 112 if (Contents.size() != SecSize || !SecSize) 113 continue; 114 uint64_t EndAddr = StartAddr + SecSize - 1; 115 116 StringRef SecName; 117 Section.getName(SecName); 118 119 if (isText) { 120 MCTextAtom *Text = nullptr; 121 MCDataAtom *InvalidData = nullptr; 122 123 uint64_t InstSize; 124 for (uint64_t Index = 0; Index < SecSize; Index += InstSize) { 125 const uint64_t CurAddr = StartAddr + Index; 126 MCInst Inst; 127 if (Dis.getInstruction(Inst, InstSize, memoryObject, CurAddr, nulls(), 128 nulls())) { 129 if (!Text) { 130 Text = Module->createTextAtom(CurAddr, CurAddr); 131 Text->setName(SecName); 132 } 133 Text->addInst(Inst, InstSize); 134 InvalidData = nullptr; 135 } else { 136 assert(InstSize && "getInstruction() consumed no bytes"); 137 if (!InvalidData) { 138 Text = nullptr; 139 InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1); 140 } 141 for (uint64_t I = 0; I < InstSize; ++I) 142 InvalidData->addData(Contents[Index+I]); 143 } 144 } 145 } else { 146 MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr); 147 Data->setName(SecName); 148 for (uint64_t Index = 0; Index < SecSize; ++Index) 149 Data->addData(Contents[Index]); 150 } 151 } 152 } 153 154 namespace { 155 struct BBInfo; 156 typedef SmallPtrSet<BBInfo*, 2> BBInfoSetTy; 157 158 struct BBInfo { 159 MCTextAtom *Atom; 160 MCBasicBlock *BB; 161 BBInfoSetTy Succs; 162 BBInfoSetTy Preds; 163 MCObjectDisassembler::AddressSetTy SuccAddrs; 164 165 BBInfo() : Atom(nullptr), BB(nullptr) {} 166 167 void addSucc(BBInfo &Succ) { 168 Succs.insert(&Succ); 169 Succ.Preds.insert(this); 170 } 171 }; 172 } 173 174 static void RemoveDupsFromAddressVector(MCObjectDisassembler::AddressSetTy &V) { 175 std::sort(V.begin(), V.end()); 176 V.erase(std::unique(V.begin(), V.end()), V.end()); 177 } 178 179 void MCObjectDisassembler::buildCFG(MCModule *Module) { 180 typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy; 181 BBInfoByAddrTy BBInfos; 182 AddressSetTy Splits; 183 AddressSetTy Calls; 184 185 for (const SymbolRef &Symbol : Obj.symbols()) { 186 SymbolRef::Type SymType; 187 Symbol.getType(SymType); 188 if (SymType == SymbolRef::ST_Function) { 189 uint64_t SymAddr; 190 Symbol.getAddress(SymAddr); 191 SymAddr = getEffectiveLoadAddr(SymAddr); 192 Calls.push_back(SymAddr); 193 Splits.push_back(SymAddr); 194 } 195 } 196 197 assert(Module->func_begin() == Module->func_end() 198 && "Module already has a CFG!"); 199 200 // First, determine the basic block boundaries and call targets. 201 for (MCModule::atom_iterator AI = Module->atom_begin(), 202 AE = Module->atom_end(); 203 AI != AE; ++AI) { 204 MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI); 205 if (!TA) continue; 206 Calls.push_back(TA->getBeginAddr()); 207 BBInfos[TA->getBeginAddr()].Atom = TA; 208 for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end(); 209 II != IE; ++II) { 210 if (MIA.isTerminator(II->Inst)) 211 Splits.push_back(II->Address + II->Size); 212 uint64_t Target; 213 if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) { 214 if (MIA.isCall(II->Inst)) 215 Calls.push_back(Target); 216 Splits.push_back(Target); 217 } 218 } 219 } 220 221 RemoveDupsFromAddressVector(Splits); 222 RemoveDupsFromAddressVector(Calls); 223 224 // Split text atoms into basic block atoms. 225 for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end(); 226 SI != SE; ++SI) { 227 MCAtom *A = Module->findAtomContaining(*SI); 228 if (!A) continue; 229 MCTextAtom *TA = cast<MCTextAtom>(A); 230 if (TA->getBeginAddr() == *SI) 231 continue; 232 MCTextAtom *NewAtom = TA->split(*SI); 233 BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom; 234 StringRef BBName = TA->getName(); 235 BBName = BBName.substr(0, BBName.find_last_of(':')); 236 NewAtom->setName((BBName + ":" + utohexstr(*SI)).str()); 237 } 238 239 // Compute succs/preds. 240 for (MCModule::atom_iterator AI = Module->atom_begin(), 241 AE = Module->atom_end(); 242 AI != AE; ++AI) { 243 MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI); 244 if (!TA) continue; 245 BBInfo &CurBB = BBInfos[TA->getBeginAddr()]; 246 const MCDecodedInst &LI = TA->back(); 247 if (MIA.isBranch(LI.Inst)) { 248 uint64_t Target; 249 if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target)) 250 CurBB.addSucc(BBInfos[Target]); 251 if (MIA.isConditionalBranch(LI.Inst)) 252 CurBB.addSucc(BBInfos[LI.Address + LI.Size]); 253 } else if (!MIA.isTerminator(LI.Inst)) 254 CurBB.addSucc(BBInfos[LI.Address + LI.Size]); 255 } 256 257 258 // Create functions and basic blocks. 259 for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end(); 260 CI != CE; ++CI) { 261 BBInfo &BBI = BBInfos[*CI]; 262 if (!BBI.Atom) continue; 263 264 MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName()); 265 266 // Create MCBBs. 267 SmallSetVector<BBInfo*, 16> Worklist; 268 Worklist.insert(&BBI); 269 for (size_t wi = 0; wi < Worklist.size(); ++wi) { 270 BBInfo *BBI = Worklist[wi]; 271 if (!BBI->Atom) 272 continue; 273 BBI->BB = &MCFN.createBlock(*BBI->Atom); 274 // Add all predecessors and successors to the worklist. 275 for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end(); 276 SI != SE; ++SI) 277 Worklist.insert(*SI); 278 for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end(); 279 PI != PE; ++PI) 280 Worklist.insert(*PI); 281 } 282 283 // Set preds/succs. 284 for (size_t wi = 0; wi < Worklist.size(); ++wi) { 285 BBInfo *BBI = Worklist[wi]; 286 MCBasicBlock *MCBB = BBI->BB; 287 if (!MCBB) 288 continue; 289 for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end(); 290 SI != SE; ++SI) 291 if ((*SI)->BB) 292 MCBB->addSuccessor((*SI)->BB); 293 for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end(); 294 PI != PE; ++PI) 295 if ((*PI)->BB) 296 MCBB->addPredecessor((*PI)->BB); 297 } 298 } 299 } 300 301 // Basic idea of the disassembly + discovery: 302 // 303 // start with the wanted address, insert it in the worklist 304 // while worklist not empty, take next address in the worklist: 305 // - check if atom exists there 306 // - if middle of atom: 307 // - split basic blocks referencing the atom 308 // - look for an already encountered BBInfo (using a map<atom, bbinfo>) 309 // - if there is, split it (new one, fallthrough, move succs, etc..) 310 // - if start of atom: nothing else to do 311 // - if no atom: create new atom and new bbinfo 312 // - look at the last instruction in the atom, add succs to worklist 313 // for all elements in the worklist: 314 // - create basic block, update preds/succs, etc.. 315 // 316 MCBasicBlock *MCObjectDisassembler::getBBAt(MCModule *Module, MCFunction *MCFN, 317 uint64_t BBBeginAddr, 318 AddressSetTy &CallTargets, 319 AddressSetTy &TailCallTargets) { 320 typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy; 321 typedef SmallSetVector<uint64_t, 16> AddrWorklistTy; 322 BBInfoByAddrTy BBInfos; 323 AddrWorklistTy Worklist; 324 325 Worklist.insert(BBBeginAddr); 326 for (size_t wi = 0; wi < Worklist.size(); ++wi) { 327 const uint64_t BeginAddr = Worklist[wi]; 328 BBInfo *BBI = &BBInfos[BeginAddr]; 329 330 MCTextAtom *&TA = BBI->Atom; 331 assert(!TA && "Discovered basic block already has an associated atom!"); 332 333 // Look for an atom at BeginAddr. 334 if (MCAtom *A = Module->findAtomContaining(BeginAddr)) { 335 // FIXME: We don't care about mixed atoms, see above. 336 TA = cast<MCTextAtom>(A); 337 338 // The found atom doesn't begin at BeginAddr, we have to split it. 339 if (TA->getBeginAddr() != BeginAddr) { 340 // FIXME: Handle overlapping atoms: middle-starting instructions, etc.. 341 MCTextAtom *NewTA = TA->split(BeginAddr); 342 343 // Look for an already encountered basic block that needs splitting 344 BBInfoByAddrTy::iterator It = BBInfos.find(TA->getBeginAddr()); 345 if (It != BBInfos.end() && It->second.Atom) { 346 BBI->SuccAddrs = It->second.SuccAddrs; 347 It->second.SuccAddrs.clear(); 348 It->second.SuccAddrs.push_back(BeginAddr); 349 } 350 TA = NewTA; 351 } 352 BBI->Atom = TA; 353 } else { 354 // If we didn't find an atom, then we have to disassemble to create one! 355 356 MemoryObject *Region = getRegionFor(BeginAddr); 357 if (!Region) 358 llvm_unreachable(("Couldn't find suitable region for disassembly at " + 359 utostr(BeginAddr)).c_str()); 360 361 uint64_t InstSize; 362 uint64_t EndAddr = Region->getBase() + Region->getExtent(); 363 364 // We want to stop before the next atom and have a fallthrough to it. 365 if (MCTextAtom *NextAtom = 366 cast_or_null<MCTextAtom>(Module->findFirstAtomAfter(BeginAddr))) 367 EndAddr = std::min(EndAddr, NextAtom->getBeginAddr()); 368 369 for (uint64_t Addr = BeginAddr; Addr < EndAddr; Addr += InstSize) { 370 MCInst Inst; 371 if (Dis.getInstruction(Inst, InstSize, *Region, Addr, nulls(), 372 nulls())) { 373 if (!TA) 374 TA = Module->createTextAtom(Addr, Addr); 375 TA->addInst(Inst, InstSize); 376 } else { 377 // We don't care about splitting mixed atoms either. 378 llvm_unreachable("Couldn't disassemble instruction in atom."); 379 } 380 381 uint64_t BranchTarget; 382 if (MIA.evaluateBranch(Inst, Addr, InstSize, BranchTarget)) { 383 if (MIA.isCall(Inst)) 384 CallTargets.push_back(BranchTarget); 385 } 386 387 if (MIA.isTerminator(Inst)) 388 break; 389 } 390 BBI->Atom = TA; 391 } 392 393 assert(TA && "Couldn't disassemble atom, none was created!"); 394 assert(TA->begin() != TA->end() && "Empty atom!"); 395 396 MemoryObject *Region = getRegionFor(TA->getBeginAddr()); 397 assert(Region && "Couldn't find region for already disassembled code!"); 398 uint64_t EndRegion = Region->getBase() + Region->getExtent(); 399 400 // Now we have a basic block atom, add successors. 401 // Add the fallthrough block. 402 if ((MIA.isConditionalBranch(TA->back().Inst) || 403 !MIA.isTerminator(TA->back().Inst)) && 404 (TA->getEndAddr() + 1 < EndRegion)) { 405 BBI->SuccAddrs.push_back(TA->getEndAddr() + 1); 406 Worklist.insert(TA->getEndAddr() + 1); 407 } 408 409 // If the terminator is a branch, add the target block. 410 if (MIA.isBranch(TA->back().Inst)) { 411 uint64_t BranchTarget; 412 if (MIA.evaluateBranch(TA->back().Inst, TA->back().Address, 413 TA->back().Size, BranchTarget)) { 414 StringRef ExtFnName; 415 if (MOS) 416 ExtFnName = 417 MOS->findExternalFunctionAt(getOriginalLoadAddr(BranchTarget)); 418 if (!ExtFnName.empty()) { 419 TailCallTargets.push_back(BranchTarget); 420 CallTargets.push_back(BranchTarget); 421 } else { 422 BBI->SuccAddrs.push_back(BranchTarget); 423 Worklist.insert(BranchTarget); 424 } 425 } 426 } 427 } 428 429 for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) { 430 const uint64_t BeginAddr = Worklist[wi]; 431 BBInfo *BBI = &BBInfos[BeginAddr]; 432 433 assert(BBI->Atom && "Found a basic block without an associated atom!"); 434 435 // Look for a basic block at BeginAddr. 436 BBI->BB = MCFN->find(BeginAddr); 437 if (BBI->BB) { 438 // FIXME: check that the succs/preds are the same 439 continue; 440 } 441 // If there was none, we have to create one from the atom. 442 BBI->BB = &MCFN->createBlock(*BBI->Atom); 443 } 444 445 for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) { 446 const uint64_t BeginAddr = Worklist[wi]; 447 BBInfo *BBI = &BBInfos[BeginAddr]; 448 MCBasicBlock *BB = BBI->BB; 449 450 RemoveDupsFromAddressVector(BBI->SuccAddrs); 451 for (AddressSetTy::const_iterator SI = BBI->SuccAddrs.begin(), 452 SE = BBI->SuccAddrs.end(); 453 SE != SE; ++SI) { 454 MCBasicBlock *Succ = BBInfos[*SI].BB; 455 BB->addSuccessor(Succ); 456 Succ->addPredecessor(BB); 457 } 458 } 459 460 assert(BBInfos[Worklist[0]].BB && 461 "No basic block created at requested address?"); 462 463 return BBInfos[Worklist[0]].BB; 464 } 465 466 MCFunction * 467 MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr, 468 AddressSetTy &CallTargets, 469 AddressSetTy &TailCallTargets) { 470 // First, check if this is an external function. 471 StringRef ExtFnName; 472 if (MOS) 473 ExtFnName = MOS->findExternalFunctionAt(getOriginalLoadAddr(BeginAddr)); 474 if (!ExtFnName.empty()) 475 return Module->createFunction(ExtFnName); 476 477 // If it's not, look for an existing function. 478 for (MCModule::func_iterator FI = Module->func_begin(), 479 FE = Module->func_end(); 480 FI != FE; ++FI) { 481 if ((*FI)->empty()) 482 continue; 483 // FIXME: MCModule should provide a findFunctionByAddr() 484 if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr) 485 return FI->get(); 486 } 487 488 // Finally, just create a new one. 489 MCFunction *MCFN = Module->createFunction(""); 490 getBBAt(Module, MCFN, BeginAddr, CallTargets, TailCallTargets); 491 return MCFN; 492 } 493 494 // MachO MCObjectDisassembler implementation. 495 496 MCMachOObjectDisassembler::MCMachOObjectDisassembler( 497 const MachOObjectFile &MOOF, const MCDisassembler &Dis, 498 const MCInstrAnalysis &MIA, uint64_t VMAddrSlide, 499 uint64_t HeaderLoadAddress) 500 : MCObjectDisassembler(MOOF, Dis, MIA), MOOF(MOOF), 501 VMAddrSlide(VMAddrSlide), HeaderLoadAddress(HeaderLoadAddress) { 502 503 for (const SectionRef &Section : MOOF.sections()) { 504 StringRef Name; 505 Section.getName(Name); 506 // FIXME: We should use the S_ section type instead of the name. 507 if (Name == "__mod_init_func") { 508 DEBUG(dbgs() << "Found __mod_init_func section!\n"); 509 Section.getContents(ModInitContents); 510 } else if (Name == "__mod_exit_func") { 511 DEBUG(dbgs() << "Found __mod_exit_func section!\n"); 512 Section.getContents(ModExitContents); 513 } 514 } 515 } 516 517 // FIXME: Only do the translations for addresses actually inside the object. 518 uint64_t MCMachOObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) { 519 return Addr + VMAddrSlide; 520 } 521 522 uint64_t 523 MCMachOObjectDisassembler::getOriginalLoadAddr(uint64_t EffectiveAddr) { 524 return EffectiveAddr - VMAddrSlide; 525 } 526 527 uint64_t MCMachOObjectDisassembler::getEntrypoint() { 528 uint64_t EntryFileOffset = 0; 529 530 // Look for LC_MAIN. 531 { 532 uint32_t LoadCommandCount = MOOF.getHeader().ncmds; 533 MachOObjectFile::LoadCommandInfo Load = MOOF.getFirstLoadCommandInfo(); 534 for (unsigned I = 0;; ++I) { 535 if (Load.C.cmd == MachO::LC_MAIN) { 536 EntryFileOffset = 537 ((const MachO::entry_point_command *)Load.Ptr)->entryoff; 538 break; 539 } 540 541 if (I == LoadCommandCount - 1) 542 break; 543 else 544 Load = MOOF.getNextLoadCommandInfo(Load); 545 } 546 } 547 548 // If we didn't find anything, default to the common implementation. 549 // FIXME: Maybe we could also look at LC_UNIXTHREAD and friends? 550 if (EntryFileOffset) 551 return MCObjectDisassembler::getEntrypoint(); 552 553 return EntryFileOffset + HeaderLoadAddress; 554 } 555 556 ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticInitFunctions() { 557 // FIXME: We only handle 64bit mach-o 558 assert(MOOF.is64Bit()); 559 560 size_t EntrySize = 8; 561 size_t EntryCount = ModInitContents.size() / EntrySize; 562 return ArrayRef<uint64_t>( 563 reinterpret_cast<const uint64_t *>(ModInitContents.data()), EntryCount); 564 } 565 566 ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticExitFunctions() { 567 // FIXME: We only handle 64bit mach-o 568 assert(MOOF.is64Bit()); 569 570 size_t EntrySize = 8; 571 size_t EntryCount = ModExitContents.size() / EntrySize; 572 return ArrayRef<uint64_t>( 573 reinterpret_cast<const uint64_t *>(ModExitContents.data()), EntryCount); 574 } 575