// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen (at) google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {

/** \class TensorOpCost
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for evaluating
  * tensor expressions.
  *
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
  }
  template <typename SrcType, typename TargetType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

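  // Illustrative sketch (not part of the library): the helpers above can be
  // combined to estimate the per-coefficient compute cost of a scalar
  // expression. The resulting numbers come from Eigen's functor_traits and
  // are rough estimates, not exact cycle counts.
  //
  //   // Estimated cycles for one evaluation of a*b + c on floats:
  //   int cycles = TensorOpCost::MulCost<float>() +
  //                TensorOpCost::AddCost<float>();
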
  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  // If the expression is vectorized, the compute cost is amortized across the
  // packet_size coefficients processed per instruction.
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
  }

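  // Illustrative sketch (not part of the library): for a vectorized sum over
  // floats with a hypothetical 4-wide packet, the per-coefficient compute
  // cost is divided by the packet size:
  //
  //   TensorOpCost cost(/*bytes_loaded=*/2 * sizeof(float),
  //                     /*bytes_stored=*/sizeof(float),
  //                     /*compute_cycles=*/TensorOpCost::AddCost<float>(),
  //                     /*vectorized=*/true, /*packet_size=*/4);
  //   // cost.compute_cycles() == AddCost<float>() / 4.0
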
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  // Collapses the cost into a single number: a weighted sum of the three
  // components, with caller-supplied weights.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }

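  // Illustrative sketch (not part of the library): the weights below are
  // hypothetical cycles-per-byte and cycles-per-compute-cycle factors.
  //
  //   TensorOpCost c(/*bytes_loaded=*/8, /*bytes_stored=*/4,
  //                  /*compute_cycles=*/2);
  //   double cycles = c.total_cost(/*load_cost=*/0.5, /*store_cost=*/0.5,
  //                                /*compute_cost=*/1.0);
  //   // 8*0.5 + 4*0.5 + 2*1.0 == 8.0
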
  // Drop memory access component. Intended for cases when memory accesses are
  // sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

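  // Illustrative sketch (not part of the library): costs compose additively
  // and scale linearly, so the per-coefficient cost of a fused expression can
  // be assembled from the costs of its parts:
  //
  //   TensorOpCost load_a(sizeof(float), 0, 0);
  //   TensorOpCost load_b(sizeof(float), 0, 0);
  //   TensorOpCost add(0, 0, TensorOpCost::AddCost<float>());
  //   TensorOpCost per_coeff = load_a + load_b + add;  // cost of a + b
  //   TensorOpCost doubled = 2.0 * per_coeff;          // two such ops
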
  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    // Adding 0.9 rounds the thread count up once there is close to a full
    // kPerThreadCycles worth of work beyond the startup cost.
    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    return numext::mini(max_threads, numext::maxi(1, threads));
  }

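  // Illustrative sketch (not part of the library): with the constants above,
  // a workload whose total cost is 350000 device cycles yields
  //   (350000 - 100000) / 100000 + 0.9 = 3.4  ->  3 threads (truncated),
  // clamped to [1:max_threads]:
  //
  //   // Hypothetical per-coefficient cost: 3.5 cycles, no memory traffic.
  //   TensorOpCost per_coeff(0, 0, 3.5);
  //   int n = TensorCostModel<ThreadPoolDevice>::numThreads(
  //       /*output_size=*/100000, per_coeff, /*max_threads=*/8);  // n == 3
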
  // taskSize assesses parallel task size.
  // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }

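  // Illustrative sketch (not part of the library): with kTaskSize = 40000,
  // a task whose total cost is 20000 device cycles scores 0.5, i.e. it should
  // be merged with neighboring work before being parallelized:
  //
  //   TensorOpCost per_coeff(0, 0, 2.0);  // hypothetical: 2 cycles/coeff
  //   double s = TensorCostModel<ThreadPoolDevice>::taskSize(
  //       /*output_size=*/10000, per_coeff);  // 20000 / 40000 == 0.5
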
 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from the L2 cache. 64 is the typical cache line
    // size; 11 is the L2 cache latency on Haswell.
    // We don't know whether the data is in L1, L2 or L3, but we are most
    // interested in single-threaded computation times around 100us-10ms:
    // shorter times are too small to benefit from parallelization, and longer
    // times are not interesting either, because we are probably already using
    // all available threads. For that target time range, L2 seems to be what
    // matters: a data set fitting into L1 is too small to take noticeable
    // time, and a data set fitting only into L3 will presumably take more
    // than 10ms to load and process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
        cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                  kDeviceCyclesPerComputeCycle);
  }
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H