1 //===- StringMatcher.cpp - Generate a matcher for input strings -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the StringMatcher class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "StringMatcher.h" 15 #include "llvm/Support/raw_ostream.h" 16 #include <map> 17 using namespace llvm; 18 19 /// FindFirstNonCommonLetter - Find the first character in the keys of the 20 /// string pairs that is not shared across the whole set of strings. All 21 /// strings are assumed to have the same length. 22 static unsigned 23 FindFirstNonCommonLetter(const std::vector<const 24 StringMatcher::StringPair*> &Matches) { 25 assert(!Matches.empty()); 26 for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) { 27 // Check to see if letter i is the same across the set. 28 char Letter = Matches[0]->first[i]; 29 30 for (unsigned str = 0, e = Matches.size(); str != e; ++str) 31 if (Matches[str]->first[i] != Letter) 32 return i; 33 } 34 35 return Matches[0]->first.size(); 36 } 37 38 /// EmitStringMatcherForChar - Given a set of strings that are known to be the 39 /// same length and whose characters leading up to CharNo are the same, emit 40 /// code to verify that CharNo and later are the same. 41 /// 42 /// \return - True if control can leave the emitted code fragment. 43 bool StringMatcher:: 44 EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches, 45 unsigned CharNo, unsigned IndentCount) const { 46 assert(!Matches.empty() && "Must have at least one string to match!"); 47 std::string Indent(IndentCount*2+4, ' '); 48 49 // If we have verified that the entire string matches, we're done: output the 50 // matching code. 51 if (CharNo == Matches[0]->first.size()) { 52 assert(Matches.size() == 1 && "Had duplicate keys to match on"); 53 54 // If the to-execute code has \n's in it, indent each subsequent line. 55 StringRef Code = Matches[0]->second; 56 57 std::pair<StringRef, StringRef> Split = Code.split('\n'); 58 OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n"; 59 60 Code = Split.second; 61 while (!Code.empty()) { 62 Split = Code.split('\n'); 63 OS << Indent << Split.first << "\n"; 64 Code = Split.second; 65 } 66 return false; 67 } 68 69 // Bucket the matches by the character we are comparing. 70 std::map<char, std::vector<const StringPair*> > MatchesByLetter; 71 72 for (unsigned i = 0, e = Matches.size(); i != e; ++i) 73 MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]); 74 75 76 // If we have exactly one bucket to match, see how many characters are common 77 // across the whole set and match all of them at once. 78 if (MatchesByLetter.size() == 1) { 79 unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches); 80 unsigned NumChars = FirstNonCommonLetter-CharNo; 81 82 // Emit code to break out if the prefix doesn't match. 83 if (NumChars == 1) { 84 // Do the comparison with if (Str[1] != 'f') 85 // FIXME: Need to escape general characters. 86 OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '" 87 << Matches[0]->first[CharNo] << "')\n"; 88 OS << Indent << " break;\n"; 89 } else { 90 // Do the comparison with if (Str.substr(1, 3) != "foo"). 91 // FIXME: Need to escape general strings. 92 OS << Indent << "if (" << StrVariableName << ".substr(" << CharNo << ", " 93 << NumChars << ") != \""; 94 OS << Matches[0]->first.substr(CharNo, NumChars) << "\")\n"; 95 OS << Indent << " break;\n"; 96 } 97 98 return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount); 99 } 100 101 // Otherwise, we have multiple possible things, emit a switch on the 102 // character. 103 OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n"; 104 OS << Indent << "default: break;\n"; 105 106 for (std::map<char, std::vector<const StringPair*> >::iterator LI = 107 MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) { 108 // TODO: escape hard stuff (like \n) if we ever care about it. 109 OS << Indent << "case '" << LI->first << "':\t // " 110 << LI->second.size() << " string"; 111 if (LI->second.size() != 1) OS << 's'; 112 OS << " to match.\n"; 113 if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1)) 114 OS << Indent << " break;\n"; 115 } 116 117 OS << Indent << "}\n"; 118 return true; 119 } 120 121 122 /// Emit - Top level entry point. 123 /// 124 void StringMatcher::Emit(unsigned Indent) const { 125 // If nothing to match, just fall through. 126 if (Matches.empty()) return; 127 128 // First level categorization: group strings by length. 129 std::map<unsigned, std::vector<const StringPair*> > MatchesByLength; 130 131 for (unsigned i = 0, e = Matches.size(); i != e; ++i) 132 MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]); 133 134 // Output a switch statement on length and categorize the elements within each 135 // bin. 136 OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n"; 137 OS.indent(Indent*2+2) << "default: break;\n"; 138 139 for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI = 140 MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) { 141 OS.indent(Indent*2+2) << "case " << LI->first << ":\t // " 142 << LI->second.size() 143 << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n"; 144 if (EmitStringMatcherForChar(LI->second, 0, Indent)) 145 OS.indent(Indent*2+4) << "break;\n"; 146 } 147 148 OS.indent(Indent*2+2) << "}\n"; 149 } 150