1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "Clustering.h" 11 #include <string> 12 #include <unordered_set> 13 14 namespace exegesis { 15 16 // The clustering problem has the following characteristics: 17 // (A) - Low dimension (dimensions are typically proc resource units, 18 // typically < 10). 19 // (B) - Number of points : ~thousands (points are measurements of an MCInst) 20 // (C) - Number of clusters: ~tens. 21 // (D) - The number of clusters is not known /a priory/. 22 // (E) - The amount of noise is relatively small. 23 // The problem is rather small. In terms of algorithms, (D) disqualifies 24 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. 25 // 26 // We've used DBSCAN here because it's simple to implement. This is a pretty 27 // straightforward and inefficient implementation of the pseudocode in [2]. 28 // 29 // [1] https://en.wikipedia.org/wiki/DBSCAN 30 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm 31 32 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not 33 // including Q). 34 std::vector<size_t> 35 InstructionBenchmarkClustering::rangeQuery(const size_t Q) const { 36 std::vector<size_t> Neighbors; 37 const auto &QMeasurements = Points_[Q].Measurements; 38 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 39 if (P == Q) 40 continue; 41 const auto &PMeasurements = Points_[P].Measurements; 42 if (PMeasurements.empty()) // Error point. 43 continue; 44 if (isNeighbour(PMeasurements, QMeasurements)) { 45 Neighbors.push_back(P); 46 } 47 } 48 return Neighbors; 49 } 50 51 bool InstructionBenchmarkClustering::isNeighbour( 52 const std::vector<BenchmarkMeasure> &P, 53 const std::vector<BenchmarkMeasure> &Q) const { 54 double DistanceSquared = 0.0; 55 for (size_t I = 0, E = P.size(); I < E; ++I) { 56 const auto Diff = P[I].Value - Q[I].Value; 57 DistanceSquared += Diff * Diff; 58 } 59 return DistanceSquared <= EpsilonSquared_; 60 } 61 62 InstructionBenchmarkClustering::InstructionBenchmarkClustering( 63 const std::vector<InstructionBenchmark> &Points, 64 const double EpsilonSquared) 65 : Points_(Points), EpsilonSquared_(EpsilonSquared), 66 NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} 67 68 llvm::Error InstructionBenchmarkClustering::validateAndSetup() { 69 ClusterIdForPoint_.resize(Points_.size()); 70 // Mark erroneous measurements out. 71 // All points must have the same number of dimensions, in the same order. 72 const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; 73 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 74 const auto &Point = Points_[P]; 75 if (!Point.Error.empty()) { 76 ClusterIdForPoint_[P] = ClusterId::error(); 77 ErrorCluster_.PointIndices.push_back(P); 78 continue; 79 } 80 const auto *CurMeasurement = &Point.Measurements; 81 if (LastMeasurement) { 82 if (LastMeasurement->size() != CurMeasurement->size()) { 83 return llvm::make_error<llvm::StringError>( 84 "inconsistent measurement dimensions", 85 llvm::inconvertibleErrorCode()); 86 } 87 for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { 88 if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { 89 return llvm::make_error<llvm::StringError>( 90 "inconsistent measurement dimensions keys", 91 llvm::inconvertibleErrorCode()); 92 } 93 } 94 } 95 LastMeasurement = CurMeasurement; 96 } 97 if (LastMeasurement) { 98 NumDimensions_ = LastMeasurement->size(); 99 } 100 return llvm::Error::success(); 101 } 102 103 void InstructionBenchmarkClustering::dbScan(const size_t MinPts) { 104 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 105 if (!ClusterIdForPoint_[P].isUndef()) 106 continue; // Previously processed in inner loop. 107 const auto Neighbors = rangeQuery(P); 108 if (Neighbors.size() + 1 < MinPts) { // Density check. 109 // The region around P is not dense enough to create a new cluster, mark 110 // as noise for now. 111 ClusterIdForPoint_[P] = ClusterId::noise(); 112 continue; 113 } 114 115 // Create a new cluster, add P. 116 Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); 117 Cluster &CurrentCluster = Clusters_.back(); 118 ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ 119 CurrentCluster.PointIndices.push_back(P); 120 121 // Process P's neighbors. 122 std::unordered_set<size_t> ToProcess(Neighbors.begin(), Neighbors.end()); 123 while (!ToProcess.empty()) { 124 // Retrieve a point from the set. 125 const size_t Q = *ToProcess.begin(); 126 ToProcess.erase(Q); 127 128 if (ClusterIdForPoint_[Q].isNoise()) { 129 // Change noise point to border point. 130 ClusterIdForPoint_[Q] = CurrentCluster.Id; 131 CurrentCluster.PointIndices.push_back(Q); 132 continue; 133 } 134 if (!ClusterIdForPoint_[Q].isUndef()) { 135 continue; // Previously processed. 136 } 137 // Add Q to the current custer. 138 ClusterIdForPoint_[Q] = CurrentCluster.Id; 139 CurrentCluster.PointIndices.push_back(Q); 140 // And extend to the neighbors of Q if the region is dense enough. 141 const auto Neighbors = rangeQuery(Q); 142 if (Neighbors.size() + 1 >= MinPts) { 143 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 144 } 145 } 146 } 147 148 // Add noisy points to noise cluster. 149 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 150 if (ClusterIdForPoint_[P].isNoise()) { 151 NoiseCluster_.PointIndices.push_back(P); 152 } 153 } 154 } 155 156 llvm::Expected<InstructionBenchmarkClustering> 157 InstructionBenchmarkClustering::create( 158 const std::vector<InstructionBenchmark> &Points, const size_t MinPts, 159 const double Epsilon) { 160 InstructionBenchmarkClustering Clustering(Points, Epsilon * Epsilon); 161 if (auto Error = Clustering.validateAndSetup()) { 162 return std::move(Error); 163 } 164 if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { 165 return Clustering; // Nothing to cluster. 166 } 167 168 Clustering.dbScan(MinPts); 169 return Clustering; 170 } 171 172 } // namespace exegesis 173