      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog (at) gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
     12 
     13 // evaluator for thread pool device
     14 #ifdef EIGEN_USE_THREADS
     15 
     16 namespace Eigen {
     17 
     18 #ifdef EIGEN_USE_SIMPLE_THREAD_POOL
     19 namespace internal {
     20 
     21 template<typename LhsScalar, typename LhsMapper, typename Index>
     22 struct packLhsArg {
     23   LhsScalar* blockA;
     24   const LhsMapper& lhs;
     25   const Index m_start;
     26   const Index k_start;
     27   const Index mc;
     28   const Index kc;
     29 };
     30 
     31 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
     32 struct packRhsAndKernelArg {
     33   const MaxSizeVector<LhsScalar*>* blockAs;
     34   RhsScalar* blockB;
     35   const RhsMapper& rhs;
     36   OutputMapper& output;
     37   const Index m;
     38   const Index k;
     39   const Index n;
     40   const Index mc;
     41   const Index kc;
     42   const Index nc;
     43   const Index num_threads;
     44   const Index num_blockAs;
     45   const Index max_m;
     46   const Index k_block_idx;
     47   const Index m_block_idx;
     48   const Index n_block_idx;
     49   const Index m_blocks;
     50   const Index n_blocks;
     51   MaxSizeVector<Notification*>* kernel_notifications;
     52   const MaxSizeVector<Notification*>* lhs_notifications;
     53   const bool need_to_pack;
     54 };
     55 
     56 }  // end namespace internal
     57 #endif  // EIGEN_USE_SIMPLE_THREAD_POOL
     58 
     59 template<typename Indices, typename LeftArgType, typename RightArgType>
     60 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
     61     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
     62 
     63   typedef ThreadPoolDevice Device;
     64 
     65   typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
     66   typedef TensorContractionEvaluatorBase<Self> Base;
     67 
     68   typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
     69   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
     70   typedef typename XprType::Index Index;
     71   typedef typename XprType::CoeffReturnType CoeffReturnType;
     72   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
     73 
     74   enum {
     75     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     76   };
     77 
      78   // Most of the code assumes that both input tensors are ColMajor. If the
     79   // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
     80   // If we want to compute A * B = C, where A is LHS and B is RHS, the code
     81   // will pretend B is LHS and A is RHS.
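           // Illustrative example: for RowMajor data, C = A * B is equivalent to the
           // ColMajor computation C^T = B^T * A^T, which is why the roles of A and B
           // are exchanged below.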
     82   typedef typename internal::conditional<
     83     static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
     84   typedef typename internal::conditional<
     85     static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
     86 
     87   static const int LDims =
     88       internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
     89   static const int RDims =
     90       internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
     91   static const int ContractDims = internal::array_size<Indices>::value;
     92 
     93   typedef array<Index, LDims> left_dim_mapper_t;
     94   typedef array<Index, RDims> right_dim_mapper_t;
     95 
     96   typedef array<Index, ContractDims> contract_t;
     97   typedef array<Index, LDims - ContractDims> left_nocontract_t;
     98   typedef array<Index, RDims - ContractDims> right_nocontract_t;
     99 
    100   static const int NumDims = LDims + RDims - 2 * ContractDims;
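           // For example, contracting a rank-3 lhs with a rank-2 rhs along a single
           // index pair gives NumDims = 3 + 2 - 2 * 1 = 3 output dimensions.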
    101 
    102   typedef DSizes<Index, NumDims> Dimensions;
    103 
    104   // typedefs needed in evalTo
    105   typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
    106   typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
    107   typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
    108 
    109   typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
    110   typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
    111 
    112   TensorEvaluator(const XprType& op, const Device& device) :
    113       Base(op, device) {}
    114 
    115 #ifndef EIGEN_USE_SIMPLE_THREAD_POOL
    116   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
    117             bool rhs_inner_dim_reordered, int Alignment>
    118   void evalProduct(Scalar* buffer) const {
    119     typedef
    120         typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
    121             LhsScalar;
    122     typedef
    123         typename internal::remove_const<typename EvalRightArgType::Scalar>::type
    124             RhsScalar;
    125     typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
    126     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
    127     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
    128     typedef internal::TensorContractionInputMapper<
    129         LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
    130         contract_t, internal::packet_traits<LhsScalar>::size,
    131         lhs_inner_dim_contiguous, false, Unaligned>
    132         LhsMapper;
    133     typedef internal::TensorContractionInputMapper<
    134         RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
    135         contract_t, internal::packet_traits<RhsScalar>::size,
    136         rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
    137         RhsMapper;
    138     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
    139     typedef internal::gemm_pack_lhs<LhsScalar, Index,
    140                                     typename LhsMapper::SubMapper, Traits::mr,
    141                                     Traits::LhsProgress, ColMajor>
    142         LhsPacker;
    143     typedef internal::gemm_pack_rhs<
    144         RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
    145         RhsPacker;
    146     typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
    147                                   Traits::mr, Traits::nr, false, false>
    148         GebpKernel;
    149 
    150     const Index m = this->m_i_size;
    151     const Index n = this->m_j_size;
    152     const Index k = this->m_k_size;
    153     if (m == 0 || n == 0 || k == 0) return;
    154 
    155     // Compute a set of algorithm parameters:
    156     // - kernel block sizes (bm, bn, bk)
    157     // - task grain sizes (number of kernels executed per task: gm, gn)
    158     // - number of threads
    159     // - sharding by row/column
    160     // - parallel packing or first lhs then rhs
    161     // and some derived parameters:
    162     // - number of tasks (nm, nn, nk)
    163     // - number of kernels (nm0, nn0)
    164     // Unfortunately, all these parameters are tightly interdependent.
    165     // So in some cases we first compute approximate values, then compute other
    166     // values based on these approximations and then refine the approximations.
    167 
    168     // There are lots of heuristics here. There is some reasoning behind them,
    169     // but ultimately they are just tuned on contraction benchmarks for
    170     // different input configurations, thread counts and instruction sets.
    171     // So feel free to question any of them.
    172 
    173     // Compute whether we want to shard by row or by column.
     174   // This is a first approximation; it will be refined later. Since we don't
     175   // know the number of threads yet, we use 2, because what we are most
    176     // interested in at this point is whether it makes sense to use
    177     // parallelization at all or not.
    178     bool shard_by_col = shardByCol(m, n, 2);
    179 
    180     // First approximation of kernel blocking sizes.
     181   // Again, we don't know the number of threads yet, so we use 2.
    182     Index bm, bn, bk;
    183     if (shard_by_col) {
    184       internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
    185                                           internal::ShardByCol>
    186           blocking(k, m, n, 2);
    187       bm = blocking.mc();
    188       bn = blocking.nc();
    189       bk = blocking.kc();
    190     } else {
    191       internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
    192                                           internal::ShardByRow>
    193           blocking(k, m, n, 2);
    194       bm = blocking.mc();
    195       bn = blocking.nc();
    196       bk = blocking.kc();
    197     }
    198 
    199     // Compute optimal number of threads.
     200     // Note: we use bk instead of k here because we are interested in the amount
     201     // of _parallelizable_ computation, and computations are not parallelizable
     202     // across the k dimension.
    203     const TensorOpCost cost =
    204         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
    205     int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
    206         static_cast<double>(n) * m, cost, this->m_device.numThreads());
    207 
    208     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
    209     // model is not tuned. Remove this when the cost model is tuned.
    210     if (n == 1) num_threads = 1;
    211 
    212     if (num_threads == 1) {
    213       // The single-threaded algorithm should be faster in this case.
    214       if (n == 1)
    215         this->template evalGemv<lhs_inner_dim_contiguous,
    216                                 rhs_inner_dim_contiguous,
    217                                 rhs_inner_dim_reordered, Alignment>(buffer);
    218       else
    219         this->template evalGemm<lhs_inner_dim_contiguous,
    220                                 rhs_inner_dim_contiguous,
    221                                 rhs_inner_dim_reordered, Alignment>(buffer);
    222       return;
    223     }
    224 
     225     // Now that we know the number of threads, recalculate sharding and blocking.
    226     shard_by_col = shardByCol(m, n, num_threads);
    227     if (shard_by_col) {
    228       internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
    229                                           internal::ShardByCol>
    230           blocking(k, m, n, num_threads);
    231       bm = blocking.mc();
    232       bn = blocking.nc();
    233       bk = blocking.kc();
    234     } else {
    235       internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
    236                                           internal::ShardByRow>
    237           blocking(k, m, n, num_threads);
    238       bm = blocking.mc();
    239       bn = blocking.nc();
    240       bk = blocking.kc();
    241     }
    242 
    243     // Number of kernels for each dimension.
    244     Index nm0 = divup(m, bm);
    245     Index nn0 = divup(n, bn);
    246     Index nk = divup(k, bk);
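             // Worked example (hypothetical sizes): m = 1000, n = 500, k = 300 with
             // bm = 64, bn = 32, bk = 128 gives nm0 = 16, nn0 = 16 and nk = 3.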
    247 
    248     // Calculate task grain size (number of kernels executed per task).
    249     // This task size coarsening serves two purposes:
    250     // 1. It reduces per-task overheads including synchronization overheads.
     251     // 2. It allows better cache use (reusing the same packed rhs in several
     252     // consecutive kernels).
    253     Index gm = 1;
    254     Index gn = 1;
    255     // If we are sharding by column, then we prefer to reduce rows first.
    256     if (shard_by_col) {
    257       gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
    258       gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
    259     } else {
    260       gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
    261       gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
    262     }
    263     // Number of tasks in each dimension.
    264     Index nm = divup(nm0, gm);
    265     Index nn = divup(nn0, gn);
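             // Continuing the hypothetical example above: with nm0 = 16 and a grain
             // size gm = 4, we get nm = divup(16, 4) = 4 tasks along the m dimension.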
    266 
     267     // Last but not least, decide whether we want to issue both lhs and rhs
    268     // packing in parallel; or issue lhs packing first, and then issue rhs
    269     // packing when lhs packing completes (for !shard_by_col lhs and rhs are
    270     // swapped). Parallel packing allows more parallelism (for both packing and
    271     // kernels), while sequential packing provides better locality (once
     272     // a thread finishes rhs packing it proceeds to kernels with that rhs).
    273     // First, we are interested in parallel packing if there are few tasks.
    274     bool parallel_pack = num_threads >= nm * nn;
    275     // Also do parallel packing if all data fits into L2$.
    276     if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
    277         l2CacheSize() * num_threads)
    278       parallel_pack = true;
    279     // But don't do it if we will use each rhs only once. Locality seems to be
    280     // more important in this case.
    281     if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
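             // Worked example (hypothetical sizes): with float scalars, m = n = 1024,
             // bk = 256 and 4 threads, the packed data is (1024 + 1024) * 256 * 4 bytes
             // = 2 MiB, so parallel packing is enabled whenever 4 * l2CacheSize() is at
             // least 2 MiB.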
    282 
    283     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
    284                   this->m_i_strides, this->m_left_contracting_strides,
    285                   this->m_k_strides);
    286 
    287     RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
    288                   this->m_j_strides, this->m_right_contracting_strides,
    289                   this->m_k_strides);
    290 
    291     Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
    292             OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
    293                           k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
    294                           shard_by_col, parallel_pack)
    295         .run();
    296   }
    297 
    298   // Context coordinates a single parallel gemm operation.
    299   template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
    300             typename LhsMapper, typename RhsMapper, typename OutputMapper>
    301   class Context {
    302    public:
    303     Context(const Device& device, int num_threads, LhsMapper& lhs,
    304             RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
    305             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
    306             Index gn, Index nm0, Index nn0, bool shard_by_col,
    307             bool parallel_pack)
    308         : device_(device),
    309           lhs_(lhs),
    310           rhs_(rhs),
    311           buffer_(buffer),
    312           output_(buffer, tm),
    313           num_threads_(num_threads),
    314           shard_by_col_(shard_by_col),
    315           parallel_pack_(parallel_pack),
    316           m_(tm),
    317           n_(tn),
    318           k_(tk),
    319           bm_(bm),
    320           bn_(bn),
    321           bk_(bk),
    322           nm_(nm),
    323           nn_(nn),
    324           nk_(nk),
    325           gm_(gm),
    326           gn_(gn),
    327           nm0_(nm0),
    328           nn0_(nn0)
    329   {
    330       for (Index x = 0; x < P; x++) {
    331         // Normal number of notifications for k slice switch is
     332         // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive only
    333         // nm_ + nn_ notifications, because they will not receive notifications
     334         // from preceding kernels.
    335         state_switch_[x] =
    336             x == 0
    337                 ? 1
    338                 : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
    339                       (x == P - 1 ? nm_ * nn_ : 0);
    340         state_packing_ready_[x] =
    341             parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
    342         state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
    343         for (Index m = 0; m < nm_; m++) {
    344           state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
    345           // Kernels generally receive 3 notifications (previous kernel + 2
    346           // packing), but the first slice won't get notifications from previous
    347           // kernels.
    348           for (Index n = 0; n < nn_; n++)
    349             state_kernel_[x][m][n].store(
    350                 (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
    351                 std::memory_order_relaxed);
    352         }
    353       }
    354 
    355       // Allocate memory for packed rhs/lhs matrices.
    356       size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
    357       size_t lhs_size =
    358           divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
    359       size_t rhs_size =
    360           divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
    361       packed_mem_ = static_cast<char*>(internal::aligned_malloc(
    362           (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
    363       char* mem = static_cast<char*>(packed_mem_);
    364       for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
    365         packed_lhs_[x].resize(nm0_);
    366         for (Index m = 0; m < nm0_; m++) {
    367           packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
    368           mem += lhs_size;
    369         }
    370         packed_rhs_[x].resize(nn0_);
    371         for (Index n = 0; n < nn0_; n++) {
    372           packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
    373           mem += rhs_size;
    374         }
    375       }
    376     }
    377 
    378     ~Context() {
    379       for (Index x = 0; x < P; x++) {
    380         for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
    381         delete[] state_kernel_[x];
    382       }
    383       internal::aligned_free(packed_mem_);
    384     }
    385 
    386     void run() {
    387       // Kick off packing of the first slice.
    388       signal_switch(0, 1);
    389       // Wait for overall completion.
    390       // TODO(dvyukov): this wait can lead to deadlock.
    391       // If nthreads contractions are concurrently submitted from worker
    392       // threads, this wait will block all worker threads and the system will
    393       // deadlock.
    394       done_.Wait();
    395     }
    396 
    397    private:
    398     Notification done_;
    399     const Device& device_;
    400     LhsMapper& lhs_;
    401     RhsMapper& rhs_;
    402     Scalar* const buffer_;
    403     OutputMapper output_;
    404     const int num_threads_;
    405     const bool shard_by_col_;
    406     const bool parallel_pack_;
    407     // Matrix sizes.
    408     const Index m_;
    409     const Index n_;
    410     const Index k_;
    411     // Block sizes.
    412     const Index bm_;
    413     const Index bn_;
    414     const Index bk_;
    415     // Number of tasks.
    416     const Index nm_;
    417     const Index nn_;
    418     const Index nk_;
    419     // Task grain sizes (number of kernels executed per task).
    420     const Index gm_;
    421     const Index gn_;
     422     // Number of blocks (this is different from nm_/nn_ because of task size
    423     // coarsening).
    424     const Index nm0_;
    425     const Index nn0_;
    426 
    427     // Parallelization strategy.
    428     //
    429     // Blocks related to the same k block can run in parallel because they write
     430     // to different output blocks. So we parallelize within k slices, which
     431     // gives us a parallelism level of m x n. Before we can start any kernels
     432     // related to the k-th slice, we need to issue m lhs packing tasks and n rhs
    433     // packing tasks.
    434     //
     435     // However, there is a bottleneck when we are finishing kernels for the k-th
     436     // slice (at the very end there is only 1 runnable kernel). To mitigate this
     437     // bottleneck we allow kernels from the k-th and (k+1)-th slices to run in
    438     // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
    439     // output block, so they must not run in parallel.
    440     //
    441     // This gives us the following dependency graph.
     442     // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
    443     // packing tasks.
    444     // Kernel (m, n, k) can start when:
    445     //  - kernel (m, n, k-1) has finished
    446     //  - lhs packing (m, k) has finished
    447     //  - rhs packing (n, k) has finished
    448     // Lhs/rhs packing can start when:
     449     //  - all packing for slice k-1 has finished (artificially imposed to limit
     450     //  the amount of parallel packing)
    451     //
    452     // On top of that we limit runnable tasks to two consecutive k slices.
     453     // This is done to limit the amount of memory we need for packed lhs/rhs
    454     // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
    455     //
    456     // state_switch_ tracks when we are ready to switch to the next k slice.
    457     // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
     458     // These variables roll over 3 consecutive k slices: the first two are being
     459     // actively executed, plus one to track completion of kernels in the second
    460     // slice.
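             // Illustrative example: with nm_ = 2, nn_ = 3 and parallel packing, a
             // steady-state k slice expects nm_ + nn_ + nm_ * nn_ = 2 + 3 + 6 = 11
             // switch notifications before moving on to the next slice.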
    461     static const Index P = 3;
    462     void* packed_mem_;
    463     std::vector<LhsScalar*> packed_lhs_[P - 1];
    464     std::vector<RhsScalar*> packed_rhs_[P - 1];
    465     std::atomic<uint8_t>** state_kernel_[P];
    466     // state_switch_ is frequently modified by worker threads, while other
     467     // fields are read-only after the constructor. Let's move it to a separate cache
    468     // line to reduce cache-coherency traffic.
    469     char pad_[128];
    470     std::atomic<Index> state_packing_ready_[P];
    471     std::atomic<Index> state_switch_[P];
    472 
    473     void pack_lhs(Index m, Index k) {
    474       const Index mend = m * gm_ + gm(m);
    475       for (Index m1 = m * gm_; m1 < mend; m1++)
    476         LhsPacker()(packed_lhs_[k % (P - 1)][m1],
    477                     lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
    478 
    479       if (!parallel_pack_ && shard_by_col_) {
    480         signal_packing(k);
    481       } else {
    482         signal_switch(k + 1);
    483         for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
    484       }
    485     }
    486 
    487     void pack_rhs(Index n, Index k) {
    488       const Index nend = n * gn_ + gn(n);
    489       for (Index n1 = n * gn_; n1 < nend; n1++) {
    490         if (k == 0) {
    491           // Zero the output memory in parallel.
     492           // On a 10000x2x10000 mm, zeroing can easily take half of the time.
     493           // Zero the m x bn output panel. Safe to do here because all kernels that will
    494           // write to this memory depend on completion of this task.
    495           // Note: don't call device_.memset() here. device_.memset() blocks on
     496           // a thread pool worker thread, which can lead to underutilization and
    497           // deadlocks.
    498           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
    499         }
    500         RhsPacker()(packed_rhs_[k % (P - 1)][n1],
    501                     rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
    502       }
    503 
    504       if (parallel_pack_ || shard_by_col_) {
    505         signal_switch(k + 1);
    506         for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
    507       } else {
    508         signal_packing(k);
    509       }
    510     }
    511 
    512     void kernel(Index m, Index n, Index k) {
    513       // Note: order of iteration matters here. Iteration over m is innermost
     514       // because we want to reuse the same packed rhs in consecutive tasks
    515       // (rhs fits into L2$ while lhs only into L3$).
    516       const Index nend = n * gn_ + gn(n);
    517       const Index mend = m * gm_ + gm(m);
    518       if (shard_by_col_) {
    519         for (Index n1 = n * gn_; n1 < nend; n1++) {
    520           for (Index m1 = m * gm_; m1 < mend; m1++)
    521             GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
    522                          packed_lhs_[k % (P - 1)][m1],
    523                          packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
    524                          Scalar(1), -1, -1, 0, 0);
    525         }
    526       } else {
    527         for (Index m1 = m * gm_; m1 < mend; m1++)
    528           for (Index n1 = n * gn_; n1 < nend; n1++) {
    529             GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
    530                          packed_lhs_[k % (P - 1)][m1],
    531                          packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
    532                          Scalar(1), -1, -1, 0, 0);
    533           }
    534       }
    535       signal_kernel(m, n, k + 1, false);
    536       signal_switch(k + 2);
    537     }
    538 
    539     void signal_packing(Index k) {
    540       eigen_assert(!parallel_pack_);
    541       Index s = state_packing_ready_[k % P].fetch_sub(1);
    542       eigen_assert(s > 0);
    543       if (s != 1) return;
    544       state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
    545       enqueue_packing(k, shard_by_col_);
    546     }
    547 
    548     void signal_kernel(Index m, Index n, Index k, bool sync) {
    549       std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
    550       Index s = state->load();
    551       eigen_assert(s > 0);
    552       if (s != 1 && state->fetch_sub(1) != 1) return;
    553       state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
    554       if (sync)
    555         kernel(m, n, k);
    556       else
    557         device_.enqueueNoNotification([=]() { kernel(m, n, k); });
    558     }
    559 
    560     void signal_switch(Index k, Index v = 1) {
    561       Index s = state_switch_[k % P].fetch_sub(v);
    562       eigen_assert(s >= v);
    563       if (s != v) return;
    564 
    565       // Ready to switch to the next k slice.
    566       // Reset counter for the next iteration.
    567       state_switch_[k % P] =
    568           (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
    569           nm_ * nn_;
    570       if (k < nk_) {
    571         // Issue lhs/rhs packing. Their completion will in turn kick off
    572         // kernels.
    573         if (parallel_pack_) {
    574           enqueue_packing(k, !shard_by_col_);
    575           enqueue_packing(k, shard_by_col_);
    576         } else if (shard_by_col_) {
    577           enqueue_packing(k, false);
    578         } else {
    579           enqueue_packing(k, true);
    580         }
    581 
    582         // Termination handling.
     583       // Because kernel completion signals the k + 2 switch, we need to finish
     584       // nk + 2 slices without issuing any tasks on the nk + 1 slice. So here we
     585       // pretend that all nk + 1 packing tasks just finish instantly, so that
     586       // the nk + 2 switch only waits for completion of the nk kernels.
    587       } else if (k == nk_) {
    588         signal_switch(k + 1,
    589                       parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
    590       } else {
    591         done_.Notify();
    592       }
    593     }
    594 
    595     // Enqueue all rhs/lhs packing for k-th slice.
    596     void enqueue_packing(Index k, bool rhs) {
    597       enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
    598     }
    599 
    600     void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
    601       if (end - start == 1) {
    602         if (rhs)
    603           pack_rhs(start, k);
    604         else
    605           pack_lhs(start, k);
    606       } else {
    607         Index mid = (start + end) / 2;
    608         device_.enqueueNoNotification(
    609             [=]() { enqueue_packing_helper(mid, end, k, rhs); });
    610         device_.enqueueNoNotification(
    611             [=]() { enqueue_packing_helper(start, mid, k, rhs); });
    612       }
    613     }
    614 
     615     // Block sizes, accounting for a potentially incomplete last block.
    616     Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
    617     Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
    618     Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
     619     // Task grain sizes, accounting for a potentially incomplete last task.
    620     Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
    621     Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
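             // For example: with m_ = 1000, bm_ = 64 and nm0_ = 16, the last block has
             // bm(15) = 1000 + 64 - 64 * 16 = 40 rows instead of 64.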
    622 
    623     Context(const Context&) = delete;
    624     void operator=(const Context&) = delete;
    625   };
    626 
    627   // Decide whether we want to shard m x n contraction by columns or by rows.
    628   static bool shardByCol(Index m, Index n, Index num_threads) {
     629     // Note: we are comparing both n and m against Traits::nr; it is not
    630     // a mistake. We are trying to figure out how both n and m will fit into
    631     // the main sharding dimension.
    632 
    633     // Sharding by column is the default
    634     // ... unless there is enough data for vectorization over rows
    635     if (m / num_threads >= Traits::nr &&
    636         // and not enough data for vectorization over columns
    637         (n / num_threads < Traits::nr ||
    638          // ... or barely enough data for vectorization over columns,
     639          // but it is not evenly divisible across threads
     640          (n / num_threads < 4 * Traits::nr &&
     641           (n % (num_threads * Traits::nr)) != 0 &&
     642           // ... and it is evenly divisible across threads for rows
     643           ((m % (num_threads * Traits::nr)) == 0 ||
     644            // ... or it is not evenly divisible in either dimension, but
    645            // there is much more data over rows so that corner effects are
    646            // mitigated.
    647            (m / n >= 6)))))
    648       return false;
     649     // Also shard by row if the matrix is just substantially elongated along
     650     // the row dimension (i.e. m is much larger than n).
    651     if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
    652     return true;
    653   }
    654 
    655   Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
    656                  int num_threads, bool shard_by_col) const {
    657     Index gm = 1;
    658     Index gm1 = 1;
    659     Index nm0 = divup(m, bm);
    660     Index nm1 = nm0;
    661     for (;;) {
    662       // Find the next candidate for m grain size. It needs to result in
     663       // a different number of blocks. E.g. if we have 10 kernels, we want to try
    664       // 5 and 10, but not 6, 7, 8 and 9.
    665       while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
    666       if (gm1 > nm0) break;
    667       // Check the candidate.
    668       int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
    669                            shard_by_col);
    670       if (res < 0) break;
    671       nm1 = divup(nm0, gm1);
    672       if (res == 0) continue;
    673       // Commit new grain size.
    674       gm = gm1;
    675     }
    676     return gm;
    677   }
    678 
    679   Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
    680                  int num_threads, bool shard_by_col) const {
    681     Index gn = 1;
    682     Index gn1 = 1;
    683     Index nn0 = divup(n, bn);
    684     Index nn1 = nn0;
    685     for (;;) {
    686       while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
    687       if (gn1 > nn0) break;
    688       int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
    689                            shard_by_col);
    690       if (res < 0) break;
    691       nn1 = divup(nn0, gn1);
    692       if (res == 0) continue;
    693       gn = gn1;
    694     }
    695     return gn;
    696   }
    697 
    698   // checkGrain checks whether grain (gm, gn) is suitable and is better than
    699   // (oldgm, oldgn).
    700   int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
    701                  Index gn, Index oldgm, Index oldgn, int num_threads,
    702                  bool shard_by_col) const {
    703     const TensorOpCost cost =
    704         contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
    705     double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
    706         static_cast<double>(bm) * gm * bn * gn, cost);
     707     // If the task is too small, then we accept it regardless of anything
     708     // else; otherwise synchronization overheads will dominate.
    709     if (taskSize < 1) return 1;
    710     // If it is too large, then we reject it and all larger tasks.
    711     if (taskSize > 2) return -1;
     712     // Now we are in a presumably good task size range.
     713     // The main deciding factor here is parallelism. Consider that we have 12
     714     // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
     715     // But grains of 2 and 4 yield 6 and 3 tasks, which gives us parallelism of
     716     // 0.75 (at most 3/4 of cores will be busy), while grain size 3 gives us 4
     717     // tasks, which gives us parallelism of 1 (we can load all cores).
    718     Index nm0 = divup(m, bm);
    719     Index nn0 = divup(n, bn);
    720     Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
    721     double new_parallelism = static_cast<double>(new_tasks) /
    722                              (divup<int>(new_tasks, num_threads) * num_threads);
    723     Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
    724     double old_parallelism = static_cast<double>(old_tasks) /
    725                              (divup<int>(old_tasks, num_threads) * num_threads);
    726     if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
    727     return 0;
    728   }
    729 
    730 #else  // EIGEN_USE_SIMPLE_THREAD_POOL
    731 
    732   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
    733   void evalProduct(Scalar* buffer) const {
    734     if (this->m_j_size == 1) {
    735       this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
    736       return;
    737     }
    738 
    739     evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
    740   }
    741 
    742   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
    743   void evalGemm(Scalar* buffer) const {
    744     // columns in left side, rows in right side
    745     const Index k = this->m_k_size;
    746 
    747     // rows in left side
    748     const Index m = this->m_i_size;
    749 
    750     // columns in right side
    751     const Index n = this->m_j_size;
    752 
     753     // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
    754     this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
    755 
    756 
    757     const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
    758     const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
    759 
    760     typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
    761                                                    LeftEvaluator, left_nocontract_t,
    762                                                    contract_t, lhs_packet_size,
    763                                                    lhs_inner_dim_contiguous,
    764                                                    false, Unaligned> LhsMapper;
    765 
    766     typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
    767                                                    RightEvaluator, right_nocontract_t,
    768                                                    contract_t, rhs_packet_size,
    769                                                    rhs_inner_dim_contiguous,
    770                                                    rhs_inner_dim_reordered, Unaligned> RhsMapper;
    771 
    772     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
    773 
    774     // TODO: packing could be faster sometimes if we supported row major tensor mappers
    775     typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
    776                                     Traits::LhsProgress, ColMajor> LhsPacker;
    777     typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
    778 
    779     // TODO: replace false, false with conjugate values?
    780     typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
    781                                   Traits::mr, Traits::nr, false, false> GebpKernel;
    782 
    783     typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
    784     typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
    785 
    786     // initialize data mappers
    787     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
    788                   this->m_left_contracting_strides, this->m_k_strides);
    789 
    790     RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
    791                   this->m_right_contracting_strides, this->m_k_strides);
    792 
    793     OutputMapper output(buffer, m);
    794 
    795     // compute block sizes (which depend on number of threads)
    796     const Index num_threads = this->m_device.numThreads();
    797     internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
    798     Index mc = blocking.mc();
    799     Index nc = blocking.nc();
    800     Index kc = blocking.kc();
    801     eigen_assert(mc <= m);
    802     eigen_assert(nc <= n);
    803     eigen_assert(kc <= k);
    804 
    805 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
    806     const Index k_blocks = CEIL_DIV(k, kc);
    807     const Index n_blocks = CEIL_DIV(n, nc);
    808     const Index m_blocks = CEIL_DIV(m, mc);
    809     const Index sizeA = mc * kc;
    810     const Index sizeB = kc * nc;
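             // For example (hypothetical blocking): mc = 64, kc = 128, nc = 32 gives
             // sizeA = 8192 and sizeB = 4096 scalars per packed block.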
    811 
    812     /*    cout << "m: " << m << " n: " << n << " k: " << k << endl;
    813     cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
    814     cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
    815     cout << "num threads: " << num_threads << endl;
    816     */
    817 
    818     // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
     819     //       aren't 16-byte aligned, segfaults will happen due to SIMD instructions
     820     // note: You can get away with allocating just a single blockA and offsets and meet
     821     //       the alignment requirements with the assumption that
    822     //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
    823     const Index numBlockAs = numext::mini(num_threads, m_blocks);
    824     MaxSizeVector<LhsScalar *> blockAs(num_threads);
    825     for (int i = 0; i < num_threads; i++) {
    826       blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
    827     }
    828 
    829     // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
    830     // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
    831     //       Other options: (1) reuse memory when a thread finishes. con: tricky
    832     //                      (2) allocate block B memory in each thread. con: overhead
    833     MaxSizeVector<RhsScalar *> blockBs(n_blocks);
    834     for (int i = 0; i < n_blocks; i++) {
    835       blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
    836     }
    837 
    838     // lhs_notifications starts with all null Notifications
    839     MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
    840 
    841     // this should really be numBlockAs * n_blocks;
    842     const Index num_kernel_notifications = num_threads * n_blocks;
    843     MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
    844                                                     nullptr);
    845 
    846     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
    847       const Index k_start = k_block_idx * kc;
     848       // make sure we don't overshoot the right edge of the left matrix
    849       const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
    850 
    851       for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
    852         const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
    853 
    854         for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
    855           const Index m_start = mt_block_idx * mc;
    856           const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
    857           eigen_assert(actual_mc > 0);
    858 
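                   // blockAId selects one of the num_threads preallocated blockAs
                   // buffers in round-robin fashion; the loop below waits for any
                   // kernels still reading that buffer before it is reused for packing.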
    859           Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
    860 
    861           for (int i = 0; i < n_blocks; ++i) {
    862             Index notification_id = (blockAId * n_blocks + i);
    863             // Wait for any current kernels using this slot to complete
    864             // before using it.
    865             if (kernel_notifications[notification_id]) {
    866               wait_until_ready(kernel_notifications[notification_id]);
    867               delete kernel_notifications[notification_id];
    868             }
    869             kernel_notifications[notification_id] = new Notification();
    870           }
    871           const packLArg arg = {
    872             blockAs[blockAId], // blockA
    873             lhs,        // lhs
    874             m_start,    // m
    875             k_start,    // k
    876             actual_mc,  // mc
    877             actual_kc,  // kc
    878           };
    879 
    880           // Delete any existing notification since we may be
    881           // replacing it.  The algorithm should ensure that there are
    882           // no existing waiters on this notification.
    883           delete lhs_notifications[blockAId];
    884           lhs_notifications[blockAId] =
    885           this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
    886         }
    887 
    888         // now start kernels.
    889         const Index m_base_start = m_block_idx * mc;
    890         const bool need_to_pack = m_block_idx == 0;
    891 
    892         for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
    893           const Index n_start = n_block_idx * nc;
    894           const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
    895 
    896           // first make sure the previous kernels are all done before overwriting rhs. Also wait if
     897           // we're going to start a new k. In both cases need_to_pack is true.
    898           if (need_to_pack) {
    899             for (Index i = num_blocks; i < num_threads; ++i) {
    900               Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
    901               Index future_id = (blockAId * n_blocks + n_block_idx);
    902               wait_until_ready(kernel_notifications[future_id]);
    903             }
    904           }
    905 
    906           packRKArg arg = {
    907             &blockAs, // blockA
    908             blockBs[n_block_idx], // blockB
    909             rhs,          // rhs
    910             output,       // output
    911             m_base_start, // m
    912             k_start,      // k
    913             n_start,      // n
    914             mc,           // mc
    915             actual_kc,    // kc
    916             actual_nc,    // nc
    917             num_threads,
    918             numBlockAs,
    919             m,
    920             k_block_idx,
    921             m_block_idx,
    922             n_block_idx, // n_block_idx
    923             m_blocks, // m_blocks
    924             n_blocks, // n_blocks
    925             &kernel_notifications, // kernel notifications
    926             &lhs_notifications,    // lhs notifications
    927             need_to_pack, // need_to_pack
    928           };
    929 
    930           // We asynchronously kick off this function, which ends up
    931           // notifying the appropriate kernel_notifications objects,
    932           // which this thread waits on before exiting.
    933           this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
    934         }
    935       }
    936     }
    937 
    938     // Make sure all the kernels are done.
    939     for (size_t i = 0; i < kernel_notifications.size(); ++i) {
    940       wait_until_ready(kernel_notifications[i]);
    941       delete kernel_notifications[i];
    942     }
    943 
    944     // No need to wait for lhs notifications since they should have
    945     // already been waited on.  Just clean them up.
    946     for (size_t i = 0; i < lhs_notifications.size(); ++i) {
    947       delete lhs_notifications[i];
    948     }
    949 
    950     // deallocate all of the memory for both A and B's
    951     for (size_t i = 0; i < blockAs.size(); i++) {
    952       this->m_device.deallocate(blockAs[i]);
    953     }
    954     for (size_t i = 0; i < blockBs.size(); i++) {
    955       this->m_device.deallocate(blockBs[i]);
    956     }
    957 
    958 #undef CEIL_DIV
    959   }
    960 
    961   /*
    962    * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
    963    * the LHS block, check that all of the kernels that worked on the same
    964    * mt_block_idx in the previous m_block are done.
    965    */
    966   template <typename packLArg, typename LhsPacker>
    967   static void packLhs(const packLArg arg) {
    968     // perform actual packing
    969     LhsPacker pack_lhs;
    970     pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
    971   }
    972 
    973   /*
    974    * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
    975    * all kernels in the previous block are done.
    976    * Then for each LHS future, we wait on the future and then call GEBP
    977    * on the area packed by the future (which starts at
    978    * blockA + future_idx * mt * kc) on the LHS and with the full packed
    979    * RHS block.
    980    * The output of this GEBP is written to output(m + i * mt, n).
    981    */
    982   template <typename packRKArg, typename RhsPacker, typename GebpKernel>
    983   static void packRhsAndKernel(packRKArg arg) {
    984     if (arg.need_to_pack) {
    985       RhsPacker pack_rhs;
    986       pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
    987     }
    988 
    989     GebpKernel gebp;
    990     for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
    991       const Index m_base_start = arg.m + arg.mc*mt_block_idx;
    992       if (m_base_start < arg.max_m) {
    993         Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
    994         wait_until_ready((*arg.lhs_notifications)[blockAId]);
    995         const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
    996         gebp(arg.output.getSubMapper(m_base_start, arg.n),
    997              (*arg.blockAs)[blockAId], arg.blockB,
    998              actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
    999 
   1000         // Notify that the kernel is done.
   1001         const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
   1002         (*arg.kernel_notifications)[set_idx]->Notify();
   1003       }
   1004     }
   1005   }
   1006 #endif  // EIGEN_USE_SIMPLE_THREAD_POOL
   1007 
   1008   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
   1009                                bool shard_by_col, bool prepacked) const {
   1010     const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
   1011                                           PacketType<RhsScalar, Device>::size);
   1012     const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
   1013     const double kd = static_cast<double>(bk);
    1014     // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
    1015     // vectorization, the bandwidth drops. The 4.0 and 2.0 bandwidth values were
    1016     // determined experimentally.
   1017     double computeBandwidth = bk == 1 ? 4.0 :
   1018           (shard_by_col ? bn : bm) < Traits::nr ||
   1019           (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
   1020 #ifndef EIGEN_VECTORIZE_FMA
   1021     // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
    1022     // However, for MULPS/ADDPS we have a dependent sequence of 2 such instructions,
    1023     // so the overall bandwidth is 1.0.
   1024     if (computeBandwidth == 0.5) computeBandwidth = 1.0;
   1025 #endif
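             // For example: with bk = 256, blocks large enough for full vectorization
             // and FMA available, computeBandwidth is 0.5 and the compute term below is
             // kd * computeBandwidth = 128 (in the cost model's units).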
   1026     // Computations.
   1027     TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
   1028     // Output stores.
   1029     cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
   1030     if (prepacked) {
   1031       // Packing and kernels are executed in different tasks. When we calculate
    1032       // the task grain size we look only at the kernel cost, assuming that the
    1033       // kernel is more expensive than packing.
   1034       return cost;
   1035     }
   1036     // Lhs/rhs loads + computations.
   1037     TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
   1038     TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
   1039     // Lhs packing memory cost does not contribute considerably to overall
   1040     // execution time because lhs is prefetched early and accessed sequentially.
   1041     if (shard_by_col)
   1042       lhsCost.dropMemoryCost();
   1043     else
   1044       rhsCost.dropMemoryCost();
   1045     return cost + lhsCost + rhsCost;
   1046   }
   1047 };
   1048 
   1049 } // end namespace Eigen
   1050 
   1051 #endif  // EIGEN_USE_THREADS
   1052 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
   1053