1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // fuzzer::InputCorpus 10 //===----------------------------------------------------------------------===// 11 12 #ifndef LLVM_FUZZER_CORPUS 13 #define LLVM_FUZZER_CORPUS 14 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <numeric> 22 #include <random> 23 #include <unordered_set> 24 25 namespace fuzzer { 26 27 struct InputInfo { 28 Unit U; // The actual input data. 29 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 30 // Number of features that this input has and no smaller input has. 31 size_t NumFeatures = 0; 32 size_t Tmp = 0; // Used by ValidateFeatureSet. 33 // Stats. 34 size_t NumExecutedMutations = 0; 35 size_t NumSuccessfullMutations = 0; 36 bool MayDeleteFile = false; 37 bool Reduced = false; 38 Vector<uint32_t> UniqFeatureSet; 39 float FeatureFrequencyScore = 1.0; 40 }; 41 42 class InputCorpus { 43 static const size_t kFeatureSetSize = 1 << 21; 44 public: 45 InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) { 46 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 47 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 48 memset(FeatureFrequency, 0, sizeof(FeatureFrequency)); 49 } 50 ~InputCorpus() { 51 for (auto II : Inputs) 52 delete II; 53 } 54 size_t size() const { return Inputs.size(); } 55 size_t SizeInBytes() const { 56 size_t Res = 0; 57 for (auto II : Inputs) 58 Res += II->U.size(); 59 return Res; 60 } 61 size_t NumActiveUnits() const { 62 size_t Res = 0; 63 for (auto II : Inputs) 64 Res += !II->U.empty(); 65 return Res; 66 } 67 size_t MaxInputSize() const { 68 size_t Res = 0; 69 for (auto II : Inputs) 70 Res = std::max(Res, II->U.size()); 71 return Res; 72 } 73 bool empty() const { return Inputs.empty(); } 74 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 75 void AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 76 const Vector<uint32_t> &FeatureSet) { 77 assert(!U.empty()); 78 if (FeatureDebug) 79 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 80 Inputs.push_back(new InputInfo()); 81 InputInfo &II = *Inputs.back(); 82 II.U = U; 83 II.NumFeatures = NumFeatures; 84 II.MayDeleteFile = MayDeleteFile; 85 II.UniqFeatureSet = FeatureSet; 86 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 87 ComputeSHA1(U.data(), U.size(), II.Sha1); 88 Hashes.insert(Sha1ToString(II.Sha1)); 89 UpdateCorpusDistribution(); 90 PrintCorpus(); 91 // ValidateFeatureSet(); 92 } 93 94 // Debug-only 95 void PrintUnit(const Unit &U) { 96 if (!FeatureDebug) return; 97 for (uint8_t C : U) { 98 if (C != 'F' && C != 'U' && C != 'Z') 99 C = '.'; 100 Printf("%c", C); 101 } 102 } 103 104 // Debug-only 105 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) { 106 if (!FeatureDebug) return; 107 Printf("{"); 108 for (uint32_t Feature: FeatureSet) 109 Printf("%u,", Feature); 110 Printf("}"); 111 } 112 113 // Debug-only 114 void PrintCorpus() { 115 if (!FeatureDebug) return; 116 Printf("======= CORPUS:\n"); 117 int i = 0; 118 for (auto II : Inputs) { 119 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 120 Printf("[%2d] ", i); 121 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 122 PrintUnit(II->U); 123 Printf(" "); 124 PrintFeatureSet(II->UniqFeatureSet); 125 Printf("\n"); 126 } 127 i++; 128 } 129 } 130 131 void Replace(InputInfo *II, const Unit &U) { 132 assert(II->U.size() > U.size()); 133 Hashes.erase(Sha1ToString(II->Sha1)); 134 DeleteFile(*II); 135 ComputeSHA1(U.data(), U.size(), II->Sha1); 136 Hashes.insert(Sha1ToString(II->Sha1)); 137 II->U = U; 138 II->Reduced = true; 139 UpdateCorpusDistribution(); 140 } 141 142 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 143 bool HasUnit(const std::string &H) { return Hashes.count(H); } 144 InputInfo &ChooseUnitToMutate(Random &Rand) { 145 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 146 assert(!II.U.empty()); 147 return II; 148 }; 149 150 // Returns an index of random unit from the corpus to mutate. 151 size_t ChooseUnitIdxToMutate(Random &Rand) { 152 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 153 assert(Idx < Inputs.size()); 154 return Idx; 155 } 156 157 void PrintStats() { 158 for (size_t i = 0; i < Inputs.size(); i++) { 159 const auto &II = *Inputs[i]; 160 Printf(" [%zd %s]\tsz: %zd\truns: %zd\tsucc: %zd\n", i, 161 Sha1ToString(II.Sha1).c_str(), II.U.size(), 162 II.NumExecutedMutations, II.NumSuccessfullMutations); 163 } 164 } 165 166 void PrintFeatureSet() { 167 for (size_t i = 0; i < kFeatureSetSize; i++) { 168 if(size_t Sz = GetFeature(i)) 169 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 170 } 171 Printf("\n\t"); 172 for (size_t i = 0; i < Inputs.size(); i++) 173 if (size_t N = Inputs[i]->NumFeatures) 174 Printf(" %zd=>%zd ", i, N); 175 Printf("\n"); 176 } 177 178 void DeleteFile(const InputInfo &II) { 179 if (!OutputCorpus.empty() && II.MayDeleteFile) 180 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 181 } 182 183 void DeleteInput(size_t Idx) { 184 InputInfo &II = *Inputs[Idx]; 185 DeleteFile(II); 186 Unit().swap(II.U); 187 if (FeatureDebug) 188 Printf("EVICTED %zd\n", Idx); 189 } 190 191 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 192 assert(NewSize); 193 Idx = Idx % kFeatureSetSize; 194 uint32_t OldSize = GetFeature(Idx); 195 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 196 if (OldSize > 0) { 197 size_t OldIdx = SmallestElementPerFeature[Idx]; 198 InputInfo &II = *Inputs[OldIdx]; 199 assert(II.NumFeatures > 0); 200 II.NumFeatures--; 201 if (II.NumFeatures == 0) 202 DeleteInput(OldIdx); 203 } else { 204 NumAddedFeatures++; 205 } 206 NumUpdatedFeatures++; 207 if (FeatureDebug) 208 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 209 SmallestElementPerFeature[Idx] = Inputs.size(); 210 InputSizesPerFeature[Idx] = NewSize; 211 return true; 212 } 213 return false; 214 } 215 216 void UpdateFeatureFrequency(size_t Idx) { 217 FeatureFrequency[Idx % kFeatureSetSize]++; 218 } 219 float GetFeatureFrequency(size_t Idx) const { 220 return FeatureFrequency[Idx % kFeatureSetSize]; 221 } 222 void UpdateFeatureFrequencyScore(InputInfo *II) { 223 const float kMin = 0.01, kMax = 100.; 224 II->FeatureFrequencyScore = kMin; 225 for (auto Idx : II->UniqFeatureSet) 226 II->FeatureFrequencyScore += 1. / (GetFeatureFrequency(Idx) + 1.); 227 II->FeatureFrequencyScore = Min(II->FeatureFrequencyScore, kMax); 228 } 229 230 size_t NumFeatures() const { return NumAddedFeatures; } 231 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 232 233 private: 234 235 static const bool FeatureDebug = false; 236 237 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 238 239 void ValidateFeatureSet() { 240 if (FeatureDebug) 241 PrintFeatureSet(); 242 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 243 if (GetFeature(Idx)) 244 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 245 for (auto II: Inputs) { 246 if (II->Tmp != II->NumFeatures) 247 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 248 assert(II->Tmp == II->NumFeatures); 249 II->Tmp = 0; 250 } 251 } 252 253 // Updates the probability distribution for the units in the corpus. 254 // Must be called whenever the corpus or unit weights are changed. 255 // 256 // Hypothesis: units added to the corpus last are more interesting. 257 // 258 // Hypothesis: inputs with infrequent features are more interesting. 259 void UpdateCorpusDistribution() { 260 size_t N = Inputs.size(); 261 assert(N); 262 Intervals.resize(N + 1); 263 Weights.resize(N); 264 std::iota(Intervals.begin(), Intervals.end(), 0); 265 for (size_t i = 0; i < N; i++) 266 Weights[i] = Inputs[i]->NumFeatures 267 ? (i + 1) * Inputs[i]->FeatureFrequencyScore 268 : 0.; 269 if (FeatureDebug) { 270 for (size_t i = 0; i < N; i++) 271 Printf("%zd ", Inputs[i]->NumFeatures); 272 Printf("NUM\n"); 273 for (size_t i = 0; i < N; i++) 274 Printf("%f ", Inputs[i]->FeatureFrequencyScore); 275 Printf("SCORE\n"); 276 for (size_t i = 0; i < N; i++) 277 Printf("%f ", Weights[i]); 278 Printf("Weights\n"); 279 } 280 CorpusDistribution = std::piecewise_constant_distribution<double>( 281 Intervals.begin(), Intervals.end(), Weights.begin()); 282 } 283 std::piecewise_constant_distribution<double> CorpusDistribution; 284 285 Vector<double> Intervals; 286 Vector<double> Weights; 287 288 std::unordered_set<std::string> Hashes; 289 Vector<InputInfo*> Inputs; 290 291 size_t NumAddedFeatures = 0; 292 size_t NumUpdatedFeatures = 0; 293 uint32_t InputSizesPerFeature[kFeatureSetSize]; 294 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 295 float FeatureFrequency[kFeatureSetSize]; 296 297 std::string OutputCorpus; 298 }; 299 300 } // namespace fuzzer 301 302 #endif // LLVM_FUZZER_CORPUS 303