Home | History | Annotate | Download | only in lib
      1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 
     10 #include "Clustering.h"
     11 #include <string>
     12 #include <unordered_set>
     13 
     14 namespace exegesis {
     15 
     16 // The clustering problem has the following characteristics:
     17 //  (A) - Low dimension (dimensions are typically proc resource units,
     18 //    typically < 10).
     19 //  (B) - Number of points : ~thousands (points are measurements of an MCInst)
     20 //  (C) - Number of clusters: ~tens.
     21 //  (D) - The number of clusters is not known /a priory/.
     22 //  (E) - The amount of noise is relatively small.
     23 // The problem is rather small. In terms of algorithms, (D) disqualifies
     24 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable.
     25 //
     26 // We've used DBSCAN here because it's simple to implement. This is a pretty
     27 // straightforward and inefficient implementation of the pseudocode in [2].
     28 //
     29 // [1] https://en.wikipedia.org/wiki/DBSCAN
     30 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm
     31 
     32 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
     33 // including Q).
     34 std::vector<size_t>
     35 InstructionBenchmarkClustering::rangeQuery(const size_t Q) const {
     36   std::vector<size_t> Neighbors;
     37   const auto &QMeasurements = Points_[Q].Measurements;
     38   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
     39     if (P == Q)
     40       continue;
     41     const auto &PMeasurements = Points_[P].Measurements;
     42     if (PMeasurements.empty()) // Error point.
     43       continue;
     44     if (isNeighbour(PMeasurements, QMeasurements)) {
     45       Neighbors.push_back(P);
     46     }
     47   }
     48   return Neighbors;
     49 }
     50 
     51 bool InstructionBenchmarkClustering::isNeighbour(
     52     const std::vector<BenchmarkMeasure> &P,
     53     const std::vector<BenchmarkMeasure> &Q) const {
     54   double DistanceSquared = 0.0;
     55   for (size_t I = 0, E = P.size(); I < E; ++I) {
     56     const auto Diff = P[I].Value - Q[I].Value;
     57     DistanceSquared += Diff * Diff;
     58   }
     59   return DistanceSquared <= EpsilonSquared_;
     60 }
     61 
     62 InstructionBenchmarkClustering::InstructionBenchmarkClustering(
     63     const std::vector<InstructionBenchmark> &Points,
     64     const double EpsilonSquared)
     65     : Points_(Points), EpsilonSquared_(EpsilonSquared),
     66       NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {}
     67 
     68 llvm::Error InstructionBenchmarkClustering::validateAndSetup() {
     69   ClusterIdForPoint_.resize(Points_.size());
     70   // Mark erroneous measurements out.
     71   // All points must have the same number of dimensions, in the same order.
     72   const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr;
     73   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
     74     const auto &Point = Points_[P];
     75     if (!Point.Error.empty()) {
     76       ClusterIdForPoint_[P] = ClusterId::error();
     77       ErrorCluster_.PointIndices.push_back(P);
     78       continue;
     79     }
     80     const auto *CurMeasurement = &Point.Measurements;
     81     if (LastMeasurement) {
     82       if (LastMeasurement->size() != CurMeasurement->size()) {
     83         return llvm::make_error<llvm::StringError>(
     84             "inconsistent measurement dimensions",
     85             llvm::inconvertibleErrorCode());
     86       }
     87       for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) {
     88         if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) {
     89           return llvm::make_error<llvm::StringError>(
     90               "inconsistent measurement dimensions keys",
     91               llvm::inconvertibleErrorCode());
     92         }
     93       }
     94     }
     95     LastMeasurement = CurMeasurement;
     96   }
     97   if (LastMeasurement) {
     98     NumDimensions_ = LastMeasurement->size();
     99   }
    100   return llvm::Error::success();
    101 }
    102 
    103 void InstructionBenchmarkClustering::dbScan(const size_t MinPts) {
    104   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
    105     if (!ClusterIdForPoint_[P].isUndef())
    106       continue; // Previously processed in inner loop.
    107     const auto Neighbors = rangeQuery(P);
    108     if (Neighbors.size() + 1 < MinPts) { // Density check.
    109       // The region around P is not dense enough to create a new cluster, mark
    110       // as noise for now.
    111       ClusterIdForPoint_[P] = ClusterId::noise();
    112       continue;
    113     }
    114 
    115     // Create a new cluster, add P.
    116     Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size()));
    117     Cluster &CurrentCluster = Clusters_.back();
    118     ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */
    119     CurrentCluster.PointIndices.push_back(P);
    120 
    121     // Process P's neighbors.
    122     std::unordered_set<size_t> ToProcess(Neighbors.begin(), Neighbors.end());
    123     while (!ToProcess.empty()) {
    124       // Retrieve a point from the set.
    125       const size_t Q = *ToProcess.begin();
    126       ToProcess.erase(Q);
    127 
    128       if (ClusterIdForPoint_[Q].isNoise()) {
    129         // Change noise point to border point.
    130         ClusterIdForPoint_[Q] = CurrentCluster.Id;
    131         CurrentCluster.PointIndices.push_back(Q);
    132         continue;
    133       }
    134       if (!ClusterIdForPoint_[Q].isUndef()) {
    135         continue; // Previously processed.
    136       }
    137       // Add Q to the current custer.
    138       ClusterIdForPoint_[Q] = CurrentCluster.Id;
    139       CurrentCluster.PointIndices.push_back(Q);
    140       // And extend to the neighbors of Q if the region is dense enough.
    141       const auto Neighbors = rangeQuery(Q);
    142       if (Neighbors.size() + 1 >= MinPts) {
    143         ToProcess.insert(Neighbors.begin(), Neighbors.end());
    144       }
    145     }
    146   }
    147 
    148   // Add noisy points to noise cluster.
    149   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
    150     if (ClusterIdForPoint_[P].isNoise()) {
    151       NoiseCluster_.PointIndices.push_back(P);
    152     }
    153   }
    154 }
    155 
    156 llvm::Expected<InstructionBenchmarkClustering>
    157 InstructionBenchmarkClustering::create(
    158     const std::vector<InstructionBenchmark> &Points, const size_t MinPts,
    159     const double Epsilon) {
    160   InstructionBenchmarkClustering Clustering(Points, Epsilon * Epsilon);
    161   if (auto Error = Clustering.validateAndSetup()) {
    162     return std::move(Error);
    163   }
    164   if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) {
    165     return Clustering; // Nothing to cluster.
    166   }
    167 
    168   Clustering.dbScan(MinPts);
    169   return Clustering;
    170 }
    171 
    172 } // namespace exegesis
    173