// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// multi_thread_gemm.h: Multi-threaded GEMM entry point.
// Readers note: To understand this file, it is useful to first
// read and understand the much simpler single_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_

#include <pthread.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

#include "single_thread_gemm.h"

namespace gemmlowp {

// On x86 and ARM platforms we enable a busy-wait spin loop before waiting on a
// pthread condition variable. To implement that correctly we need to insert
// explicit memory load/store barriers.

#if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \
    (defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86))

#define GEMMLOWP_USE_BUSYWAIT

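// Maximum number of NOPs to execute while busy-waiting before falling back to
// passive (condition-variable) waiting in WaitForVariableChange() below.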
const int kMaxBusyWaitNOPs = 32 * 1000 * 1000;

#define GEMMLOWP_NOP "nop\n"

#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)

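// Note: despite its name, this currently executes a batch of 64 NOPs per call,
// and returns the number of NOPs executed so that the caller can count
// progress towards kMaxBusyWaitNOPs.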
inline int Do256NOPs() {
  asm volatile(GEMMLOWP_NOP64);
  return 64;
}

#undef GEMMLOWP_STRING_CONCAT_4
#undef GEMMLOWP_NOP64
#undef GEMMLOWP_NOP16
#undef GEMMLOWP_NOP4
#undef GEMMLOWP_NOP

inline void WriteBarrier() {
#ifdef GEMMLOWP_ARM_32
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  asm volatile("dmb ishst" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("sfence" ::: "memory");
#else
#error "Unsupported architecture for WriteBarrier."
#endif
}

inline void ReadBarrier() {
#ifdef GEMMLOWP_ARM_32
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  asm volatile("dmb ishld" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("lfence" ::: "memory");
#else
#error "Unsupported architecture for ReadBarrier."
#endif
}

#endif

// Waits until *var != initial_value.
//
// Returns the new value of *var. The guarantee here is that
// the return value is different from initial_value, and that this
// new value was taken by *var at some point during the
// execution of this function. There is no guarantee that this is
// still the value of *var when this function returns, since *var is
// not assumed to be guarded by any lock.
//
// First does some busy-waiting for a fixed number of no-op cycles,
// then falls back to passive waiting for the given condvar, guarded
// by the given mutex.
//
// The idea of doing some initial busy-waiting is to help get
// better and more consistent multithreading benefits for small GEMM sizes.
// Busy-waiting helps ensure that if we need to wake up soon after having
// started waiting, then we can wake up quickly (as opposed to, say,
// having to wait to be scheduled again by the OS). On the other hand,
// we must still eventually revert to passive waiting for longer waits
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
// so as to avoid permanently spinning.
//
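// Note: when GEMMLOWP_USE_BUSYWAIT is defined, the ReadBarrier() calls below
// pair with the WriteBarrier() calls made by the writers (see Worker::StartWork
// and BlockingCounter::DecrementCount), so that any data written before the
// variable change is visible once the new value is observed.
//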
template <typename T>
T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond,
                        pthread_mutex_t* mutex) {
#ifdef GEMMLOWP_USE_BUSYWAIT
  // If we are on a platform that supports it, spin for some time.
  {
    int nops = 0;
    // First, trivial case where the variable already changed value.
    T new_value = *var;
    if (new_value != initial_value) {
      ReadBarrier();
      return new_value;
    }
    // Then try busy-waiting.
    while (nops < kMaxBusyWaitNOPs) {
      nops += Do256NOPs();
      new_value = *var;
      if (new_value != initial_value) {
        ReadBarrier();
        return new_value;
      }
    }
  }
#endif

  // Finally, do real passive waiting.
  pthread_mutex_lock(mutex);
  T new_value = *var;
  if (new_value == initial_value) {
    pthread_cond_wait(cond, mutex);
    new_value = *var;
    assert(new_value != initial_value);
  }
  pthread_mutex_unlock(mutex);
  return new_value;
}

// A BlockingCounter lets one thread wait for N events to occur.
// This is how the master thread waits for all the worker threads
// to have finished working.
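//
// Typical usage here: Reset(N), hand work to N workers that each call
// DecrementCount() when done, then Wait() until the count reaches zero.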
class BlockingCounter {
 public:
  BlockingCounter()
      : cond_(PTHREAD_COND_INITIALIZER),
        mutex_(PTHREAD_MUTEX_INITIALIZER),
        count_(0),
        initial_count_(0) {}

  // Sets/resets the counter; initial_count is the number of
  // decrementing events that the Wait() call will be waiting for.
  void Reset(std::size_t initial_count) {
    pthread_mutex_lock(&mutex_);
    assert(count_ == 0);
    initial_count_ = initial_count;
    count_ = initial_count_;
    pthread_mutex_unlock(&mutex_);
  }

  // Decrements the counter; if the counter hits zero, signals
  // the thread that was waiting for that, and returns true.
  // Otherwise (if the decremented count is still nonzero),
  // returns false.
  bool DecrementCount() {
    pthread_mutex_lock(&mutex_);
    assert(count_ > 0);
    count_--;
#ifdef GEMMLOWP_USE_BUSYWAIT
    WriteBarrier();
#endif
    if (count_ == 0) {
      pthread_cond_signal(&cond_);
    }
    bool retval = count_ == 0;
    pthread_mutex_unlock(&mutex_);
    return retval;
  }

  // Waits for the N other threads (N having been set by Reset())
  // to hit the BlockingCounter.
  void Wait() {
    ScopedProfilingLabel label("BlockingCounter::Wait");
    while (count_) {
      MemoryBarrier();
      const std::size_t count_value = count_;
      if (count_value) {
        WaitForVariableChange(&count_, count_value, &cond_, &mutex_);
      }
    }
  }

 private:
  pthread_cond_t cond_;
  pthread_mutex_t mutex_;
  std::size_t count_;
  std::size_t initial_count_;
};

// A workload for a worker.
struct Task {
  Task() : local_allocator(nullptr) {}
  virtual ~Task() {}
  virtual void Run() = 0;
  Allocator* local_allocator;
};

// A worker thread.
class Worker {
 public:
  enum class State {
    ThreadStartup,  // The initial state before the thread main loop runs.
    Ready,          // Is not working, has not yet received new work to do.
    HasWork,        // Has work to do.
    ExitAsSoonAsPossible  // Should exit at earliest convenience.
  };

  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
      : task_(nullptr),
        state_cond_(PTHREAD_COND_INITIALIZER),
        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
        state_(State::ThreadStartup),
        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
    pthread_create(&thread_, nullptr, ThreadFunc, this);
  }

  ~Worker() {
    ChangeState(State::ExitAsSoonAsPossible);
    pthread_join(thread_, nullptr);
  }

  // Changes State; may be called from either the worker thread
  // or the master thread. However, not all state transitions are legal,
  // as enforced by the assertions below.
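  // The legal transitions are:
  //   ThreadStartup -> Ready
  //   Ready         -> HasWork or ExitAsSoonAsPossible
  //   HasWork       -> Ready or ExitAsSoonAsPossible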
  void ChangeState(State new_state) {
    ScopedProfilingLabel label("Worker::ChangeState");
    pthread_mutex_lock(&state_mutex_);
    assert(new_state != state_);
    switch (state_) {
      case State::ThreadStartup:
        assert(new_state == State::Ready);
        break;
      case State::Ready:
        assert(new_state == State::HasWork ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      case State::HasWork:
        assert(new_state == State::Ready ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      default:
        abort();
    }
    state_ = new_state;
    pthread_cond_signal(&state_cond_);
    if (state_ == State::Ready) {
      counter_to_decrement_when_ready_->DecrementCount();
    }
    pthread_mutex_unlock(&state_mutex_);
  }

  // Thread entry point.
  void ThreadFunc() {
    ScopedProfilingLabel label("Worker::ThreadFunc");
    RegisterCurrentThreadForProfiling();

    ChangeState(State::Ready);

    // Thread main loop
    while (true) {
      // Get a state to act on
      // In the 'Ready' state, we have nothing to do but to wait until
      // we switch to another state.
      State state_to_act_upon = WaitForVariableChange(
          &state_, State::Ready, &state_cond_, &state_mutex_);

      // We now have a state to act on, so act.
      switch (state_to_act_upon) {
        case State::HasWork:
          // Got work to do! So do it, and then revert to 'Ready' state.
          assert(task_);
          task_->Run();
          task_ = nullptr;
          ChangeState(State::Ready);
          break;
        case State::ExitAsSoonAsPossible:
          return;
        default:
          abort();
      }
    }
  }

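  // pthread entry point: trampolines to the member ThreadFunc() above.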
  static void* ThreadFunc(void* arg) {
    static_cast<Worker*>(arg)->ThreadFunc();
    return nullptr;
  }

  // Called by the master thread to give this worker work to do.
  // It is only legal to call this if the worker is currently in the
  // 'Ready' state (see the assertion below).
  void StartWork(Task* task) {
    assert(!task_);
    task->local_allocator = &local_allocator_;
    task_ = task;
#ifdef GEMMLOWP_USE_BUSYWAIT
    WriteBarrier();
#endif
    assert(state_ == State::Ready);
    ChangeState(State::HasWork);
  }

 private:
  // The underlying thread.
  pthread_t thread_;

  // The task to be worked on.
  Task* task_;

  // The condition variable and mutex guarding state changes.
  pthread_cond_t state_cond_;
  pthread_mutex_t state_mutex_;

  // The state enum tells if we're currently working, waiting for work, etc.
  State state_;

  // Each worker has a local allocator so that workers can allocate temporary
  // buffers without blocking each other.
  Allocator local_allocator_;

  // Pointer to the master thread's BlockingCounter object, used to notify the
  // master thread when this worker switches to the 'Ready' state.
  BlockingCounter* const counter_to_decrement_when_ready_;
};


// A very simple pool of workers, which only supports the very specific
// parallelization pattern that we use here: a fixed number of workers can be
// given work, and one then waits for all of them to finish.
//
// See MultiThreadGemmContextBase for how other WorkersPool implementations can
// be used. Note that in those implementations, StartWorker is free to ignore
// the <index> value; that is, the caller of WorkersPool does not rely on
// <index> to order tasks.
class WorkersPool {
 public:
  WorkersPool() {}

  ~WorkersPool() {
    for (auto w : workers_) {
      delete w;
    }
  }

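  // Runs the given tasks: the last task is executed directly on the calling
  // thread, the others are handed off to worker threads. Takes ownership of
  // the tasks and deletes them once they have all finished.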
  void Execute(const std::vector<Task*>& tasks) {
    assert(tasks.size() >= 1);
    // One of the tasks will be run on the current thread.
    int workers_count = static_cast<int>(tasks.size()) - 1;
    CreateWorkers(workers_count);
    assert(static_cast<std::size_t>(workers_count) <= workers_.size());
    counter_to_decrement_when_ready_.Reset(workers_count);
    int n = 0;
    std::for_each(tasks.begin(), tasks.end() - 1, [this, &n](Task* task) {
      workers_[n++]->StartWork(task);
    });
    // Execute the remaining workload immediately on the current thread.
    Task* task = tasks.back();
    task->local_allocator = &main_thread_task_allocator_;
    task->Run();
    // Wait for the workers submitted above to finish.
    counter_to_decrement_when_ready_.Wait();
    // Clean up the tasks (best to do this from the same thread that allocated
    // the memory).
    std::for_each(tasks.begin(), tasks.end(), [](Task* task) {
      delete task;
    });
  }


 private:
  // Ensures that the pool has at least the given count of workers.
  // If any new worker has to be created, this function waits for it to
  // be ready.
  void CreateWorkers(std::size_t workers_count) {
    if (workers_.size() >= workers_count) {
      return;
    }
    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
    while (workers_.size() < workers_count) {
      workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
    }
    counter_to_decrement_when_ready_.Wait();
  }

  // Copy construction is disallowed.
  WorkersPool(const WorkersPool&) = delete;

  // The workers in this pool. They are owned by the pool:
  // the pool creates workers and destroys them in its destructor.
  std::vector<Worker*> workers_;

  // The BlockingCounter used to wait for the workers.
  BlockingCounter counter_to_decrement_when_ready_;

  // For N-threaded operations, we will use only N-1 worker threads
  // while the last task will be run directly on the main thread.
  // It will then use this main_thread_task_allocator_; having a
  // dedicated allocator for that (separate from the base allocator_)
  // allows us to use the same code for all tasks regardless of which
  // thread they run on.
  Allocator main_thread_task_allocator_;
};

// The task we use to implement a multi-threaded Gemm: a block of the
// RHS has been packed by the master thread; each worker thread
// then has to pack a block of the LHS and accumulate the Gemm of these
// packed LHS and RHS blocks.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
struct GemmWithPackedRhsTask : Task {
  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
  GemmWithPackedRhsTask(GemmContextType* _context,
                        const KernelBase& _kernel,
                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
                        const PackedRhs& _packed_rhs,
                        MatrixMap<OutputScalar, ResultOrder>* _result,
                        const MatrixBlockBounds& _result_block,
                        const LhsOffset& _lhs_offset,
                        const RhsOffset& _rhs_offset,
                        const OutputPipelineType& _output_pipeline)
      : context(_context),
        kernel(_kernel),
        lhs(_lhs),
        packed_rhs(_packed_rhs),
        result(*_result),
        result_block(_result_block),
        lhs_offset(_lhs_offset),
        rhs_offset(_rhs_offset),
        output_pipeline(_output_pipeline) {}

  void Run() override {
    ScopedProfilingLabel label("GemmWithPackedRhsTask");

    const int rows = result_block.rows;
    const int cols = result_block.cols;
    const int depth = lhs.cols();

    BlockParams block_params;
    block_params.Init<KernelFormat>(rows, cols, depth, 1,
                                    context->l1_bytes_to_use(),
                                    context->l2_bytes_to_use(),
                                    context->l2_rhs_factor());

    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);

    PackedResult packed_result(local_allocator, block_params);

    local_allocator->Commit();

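    // For each l2-sized block of the result: pack the corresponding block of
    // the LHS, run the compute kernel to accumulate into packed_result, then
    // unpack into the result matrix, applying the offsets and output pipeline.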
    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      for (int r = 0; r < rows; r += block_params.l2_rows) {
        int rs = std::min(block_params.l2_rows, rows - r);

        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
                depth);

        auto curr_result_block = MatrixBlockBounds(
            result_block.start_row + r, result_block.start_col + c, rs, cs);
        UnpackResult<KernelFormat>(
            &result, curr_result_block, packed_result, depth,
            packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
            lhs_offset.block(curr_result_block.start_row, rs),
            rhs_offset.block(curr_result_block.start_col, cs), output_pipeline);
      }
    }

    local_allocator->Decommit();
  }

  const GemmContextType* context;
  const KernelBase& kernel;
  const MatrixMap<const InputScalar, LhsOrder> lhs;
  const PackedRhs packed_rhs;
  MatrixMap<OutputScalar, ResultOrder> result;
  const MatrixBlockBounds result_block;
  const LhsOffset& lhs_offset;
  const RhsOffset& rhs_offset;
  const OutputPipelineType& output_pipeline;
};

// This base class for multi-threading allows subclasses to implement their own
// workers_pool() method. See MultiThreadGemmContext below for an example;
// any other implementation of workers_pool() must return an object with the
// same public methods as WorkersPool.
class MultiThreadGemmContextBase : public SingleThreadGemmContext {
 public:
  void set_max_num_threads(int n) { max_num_threads_ = n; }

  int max_num_threads() const { return max_num_threads_; }

 protected:
  // The maximum number of threads to use (including
  // the master thread).
  // The default value 1 means single-threading. That is the default
  // because gemmlowp's primary target is mobile hardware, where thermal
  // constraints usually mean that it may not be realistic to use more
  // than 1 CPU core even if multiple cores are present.
  // The special value 0 means try to detect the number of hardware threads.
  // Note: this assumes that all CPU cores are equivalent. That assumption
  // is defeated on big.LITTLE ARM devices, where we have no API to query
  // the number of big cores (which is typically what we would want to use,
  // leaving aside the above-mentioned thermal issues). That is the other
  // reason why the best compromise here is to let max_num_threads_ default
  // to 1, so users who want multi-threading must decide for themselves how
  // many threads to use.
  int max_num_threads_ = 1;
};

class MultiThreadGemmContext : public MultiThreadGemmContextBase {
 public:
  WorkersPool* workers_pool() { return &workers_pool_; }

 private:
  // The workers pool used by MultiThreadGemm. Making
  // this part of the context allows it to be persistent,
  // avoiding recreating threads on every Gemm.
  WorkersPool workers_pool_;
};
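
// Typical usage sketch. This assumes the public entry points declared in
// public/gemmlowp.h, where the default GemmContext is backed by this
// multi-threaded context:
//
//   gemmlowp::GemmContext context;
//   context.set_max_num_threads(4);  // 0 means auto-detect; default is 1.
//   gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
//                                    gemmlowp::DefaultL8R8BitDepthParams>(
//       &context, lhs, rhs, &result, lhs_offset, rhs_offset, output_pipeline);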

// Needed by Chrome native builds.
#ifndef _SC_NPROCESSORS_CONF
#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_ONLN
#endif

// Determines how many threads should be used for a given Gemm
// operation.
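//
// Illustrative example (not taken from the code below): with
// max_num_threads = 8, rows = 100, cols = depth = 100 and KernelRows = 4,
// MinRowsPerThread is 16, so thread_count = min(8, CeilQuotient(100, 16)) = 7;
// the cubic-size check (100 * 100 * 100) / (64 * 1024) = 15 then leaves
// thread_count at 7.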
template <int KernelRows>
inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) {
  // Early-exit in the default case where multi-threading is disabled.
  if (max_num_threads == 1) {
    return 1;
  }

  // Determine the maximum number of threads.
  int max_count = max_num_threads;
  // The special value 0 means try to determine the total number of cores.
  if (max_count == 0) {
    // No user-set maximum number of threads, so we need to
    // do some hardware detection.
    // This is expensive to query so we do it only once.
    // Too bad for dynamism. Also, we don't use the C++11 standard getter
    // (std::thread::hardware_concurrency) because Google's coding style
    // currently bans #include <thread>.
    static const int hardware_threads_count =
        static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));

    max_count = hardware_threads_count;
  }

  // Basic calculation: take into account max pool size, and
  // how many rows we have to feed our kernel.
  // The motivation for an absolute minimum number of rows per thread,
  // potentially higher than KernelRows, is that very thin per-thread
  // workloads currently defeat assumptions of the AddMod generator, resulting
  // in substantial bias in TestWithRealData on 24 threads.
  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
  // so as to be independent of the number of threads.
  static const int AbsoluteMinRowsPerThread = 16;
  static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
                                          ? KernelRows
                                          : AbsoluteMinRowsPerThread;
  int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));

  // At this point for small products we already have thread_count==1 so
  // we can avoid doing more work; otherwise, we still want to check
  // that the cubic size (rows*cols*depth) is big enough to keep
  // all workers busy.
  if (thread_count > 1) {
    // Empirically determined value.
    static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;

    // We can only multiply two out of three sizes without risking overflow
    // in 32 bits, so we compute the product in 64 bits.
    const std::uint64_t cubic_size =
        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);

    thread_count =
        std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));

    if (thread_count < 1) {
      thread_count = 1;
    }
  }

  assert(thread_count > 0 && thread_count <= max_count);
  return thread_count;
}


// The main multi-threaded Gemm function.
// To understand it, first read the code of SingleThreadGemm().
// The parallelization scheme used here is to have this master function
// pack a block of RHS and then start worker threads to pack a block of LHS
// each, and accumulate the corresponding products.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel,
                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
                     MatrixMap<OutputScalar, ResultOrder>* result,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // Zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows < cols should have been caught earlier and transposed.
  assert(rows >= cols);

  const int thread_count = HowManyThreads<KernelFormat::kRows>(
      context->max_num_threads(), rows, cols, depth);
  if (thread_count == 1) {
    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
                            BitDepthParams>(context, kernel, lhs, rhs, result,
                                            lhs_offset, rhs_offset,
                                            output_pipeline);
  }
  assert(thread_count > 1);

  // Simple 1:1 mapping of tasks to physical cores, which is very important
  // for getting good multithreaded performance, especially for not-very-large
  // GEMMs, and especially on Android.
  const int task_count = thread_count;

  Allocator* allocator = context->allocator();
  auto* workers_pool = context->workers_pool();

  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, task_count,
                                  context->l1_bytes_to_use(),
                                  context->l2_bytes_to_use(),
                                  context->l2_rhs_factor());

  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);
  allocator->Commit();

  // We loop over large blocks of the RHS.
  for (int c = 0; c < cols; c += block_params.l2_cols) {
    int cs = std::min(block_params.l2_cols, cols - c);

    // Pack a large block of the RHS.
    PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));

    // Give work to each worker.
    std::vector<Task*> tasks;
    int next_start_row = 0;
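    // Partition the rows roughly evenly among the tasks, rounding each
    // boundary up to a multiple of KernelFormat::kRows so that every task
    // gets a kernel-friendly block height.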
    for (int n = 0; n < task_count; ++n) {
      int start_row = next_start_row;
      next_start_row = std::min(rows, RoundUp<KernelFormat::kRows>(
                                          rows * (n + 1) / task_count));

      int block_rows = next_start_row - start_row;
      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
      typedef GemmWithPackedRhsTask<
          KernelFormat, InputScalar, OutputScalar, BitDepthParams, LhsOrder,
          RhsOrder, ResultOrder, LhsOffset, RhsOffset, OutputPipelineType,
          GemmContextType>
          TaskType;
      tasks.push_back(new TaskType(context, kernel, lhs_block, packed_rhs, result,
                                   MatrixBlockBounds(start_row, c, block_rows, cs),
                                   lhs_offset, rhs_offset, output_pipeline));
    }
    // Execute the work on the workers (and partially on this thread).
    workers_pool->Execute(tasks);
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_