// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// multi_thread_gemm.h: Multi-threaded GEMM entry point.
// Reader's note: to understand this file, it is useful to first
// read and understand the much simpler single_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_

#include <pthread.h>
#include <unistd.h>
#include <vector>

#include "single_thread_gemm.h"

namespace gemmlowp {

#ifdef GEMMLOWP_ALLOW_INLINE_ASM
// Where inline asm is allowed, we use some busy-waiting,
// preferably implemented using NOP instructions.
const int kMaxBusyWaitNOPs = 32 * 1000 * 1000;

#define GEMMLOWP_NOP "nop\n"

#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)
#define GEMMLOWP_NOP256 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP64)

inline int Do256NOPs() {
  asm volatile(GEMMLOWP_NOP256);
  return 256;
}

#undef GEMMLOWP_STRING_CONCAT_4
#undef GEMMLOWP_NOP256
#undef GEMMLOWP_NOP64
#undef GEMMLOWP_NOP16
#undef GEMMLOWP_NOP4
#undef GEMMLOWP_NOP

#else  // not GEMMLOWP_ALLOW_INLINE_ASM

// It is nontrivial to implement good busy-waiting without asm;
// NOP instructions have the fewest side effects and the lowest power usage.
// Since busy-waiting is only an optimization, it is not worth much effort
// on platforms where we are slow anyway due to not being able to
// use our inline asm kernels.

const int kMaxBusyWaitNOPs = 0;
inline int Do256NOPs() { return 0; }

#endif  // not GEMMLOWP_ALLOW_INLINE_ASM

inline void WriteBarrier() {
#ifdef GEMMLOWP_ARM_32
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  asm volatile("dmb ishst" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("sfence" ::: "memory");
#elif defined(__mips__)
  MemoryBarrier();
#else
#error "Unsupported architecture for WriteBarrier."
#endif
}

inline void ReadBarrier() {
#ifdef GEMMLOWP_ARM_32
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  asm volatile("dmb ishld" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("lfence" ::: "memory");
#elif defined(__mips__)
  MemoryBarrier();
#else
#error "Unsupported architecture for ReadBarrier."
#endif
}

// Waits until *var != initial_value.
//
// Returns the new value of *var. The guarantee here is that
// the return value is different from initial_value, and that the
// new value has been taken by *var at some point during the
// execution of this function. There is no guarantee that this is
// still the value of *var when this function returns, since *var is
// not assumed to be guarded by any lock.
//
// First does some busy-waiting for a fixed number of no-op cycles,
// then falls back to passive waiting on the given condvar, guarded
// by the given mutex.
//
// The idea of doing some initial busy-waiting is to help get
// better and more consistent multithreading benefits for small GEMM sizes.
// Busy-waiting helps ensure that if we need to wake up soon after having
// started waiting, then we can wake up quickly (as opposed to, say,
// having to wait to be scheduled again by the OS). On the other hand,
// we must still eventually revert to passive waiting for longer waits
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
// so as to avoid permanently spinning.
//
template <typename T>
T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond,
                        pthread_mutex_t* mutex) {
  int nops = 0;
  // First, the trivial case where the variable already changed value.
  T new_value = *var;
  if (new_value != initial_value) {
    return new_value;
  }
  // Then try busy-waiting.
  while (nops < kMaxBusyWaitNOPs) {
    nops += Do256NOPs();
    new_value = *var;
    if (new_value != initial_value) {
      return new_value;
    }
  }
  // Finally, do real passive waiting.
  pthread_mutex_lock(mutex);
  new_value = *var;
  if (new_value == initial_value) {
    pthread_cond_wait(cond, mutex);
    new_value = *var;
    assert(new_value != initial_value);
  }
  pthread_mutex_unlock(mutex);
  return new_value;
}

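// A minimal sketch (not part of the library) of the notifying side that
// WaitForVariableChange pairs with: the writer must update the variable and
// signal the condvar while holding the same mutex, as Worker::ChangeState
// does below. The names 'state', 'new_state', 'cond' and 'mutex' here are
// placeholders.
//
//   pthread_mutex_lock(&mutex);
//   state = new_state;            // the change being waited for
//   pthread_cond_signal(&cond);   // wakes a passive waiter, if any
//   pthread_mutex_unlock(&mutex);
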
// A BlockingCounter lets one thread wait for N events to occur.
// This is how the master thread waits for all the worker threads
// to have finished working.
class BlockingCounter {
 public:
  BlockingCounter()
      : cond_(PTHREAD_COND_INITIALIZER),
        mutex_(PTHREAD_MUTEX_INITIALIZER),
        count_(0),
        initial_count_(0) {}

  // Sets/resets the counter; initial_count is the number of
  // decrementing events that the Wait() call will be waiting for.
  void Reset(std::size_t initial_count) {
    pthread_mutex_lock(&mutex_);
    assert(count_ == 0);
    initial_count_ = initial_count;
    count_ = initial_count_;
    pthread_mutex_unlock(&mutex_);
  }

  // Decrements the counter; if the counter hits zero, signals
  // the thread that was waiting for that, and returns true.
  // Otherwise (if the decremented count is still nonzero),
  // returns false.
  bool DecrementCount() {
    pthread_mutex_lock(&mutex_);
    assert(count_ > 0);
    count_--;
    if (count_ == 0) {
      pthread_cond_signal(&cond_);
    }
    bool retval = count_ == 0;
    pthread_mutex_unlock(&mutex_);
    return retval;
  }

  // Waits for the N other threads (N having been set by Reset())
  // to hit the BlockingCounter.
  void Wait() {
    ScopedProfilingLabel label("BlockingCounter::Wait");
    while (count_) {
      MemoryBarrier();
      const std::size_t count_value = count_;
      if (count_value) {
        WaitForVariableChange(&count_, count_value, &cond_, &mutex_);
      }
    }
  }

 private:
  pthread_cond_t cond_;
  pthread_mutex_t mutex_;
  std::size_t count_;
  std::size_t initial_count_;
};

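// Illustrative sketch (not part of the library): how a master thread can use
// a BlockingCounter to wait for 4 events, assuming each of 4 workers
// eventually calls DecrementCount() exactly once.
//
//   BlockingCounter counter;
//   counter.Reset(4);    // expect 4 decrementing events
//   // ... hand work to 4 workers, each of which calls
//   // counter.DecrementCount() when it is done ...
//   counter.Wait();      // returns once the count has reached zero
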
// A workload for a worker.
struct Task {
  Task() : local_allocator(nullptr) {}
  virtual ~Task() {}
  virtual void Run() const = 0;
  Allocator* local_allocator;
};

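// Illustrative sketch (not part of the library): a minimal Task
// implementation. The IncrementTask name and its target pointer are
// hypothetical, made up for this example only.
//
//   struct IncrementTask : Task {
//     explicit IncrementTask(int* target) : target_(target) {}
//     void Run() const override { ++*target_; }
//     int* target_;
//   };
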
// A worker thread.
class Worker {
 public:
  enum class State {
    ThreadStartup,  // The initial state before the thread main loop runs.
    Ready,          // Is not working, has not yet received new work to do.
    HasWork,        // Has work to do.
    ExitAsSoonAsPossible  // Should exit at earliest convenience.
  };
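
  // For reference, the legal state transitions (as enforced by the
  // assertions in ChangeState() below) are:
  //   ThreadStartup -> Ready
  //   Ready         -> HasWork or ExitAsSoonAsPossible
  //   HasWork       -> Ready or ExitAsSoonAsPossible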

  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
      : task_(nullptr),
        state_cond_(PTHREAD_COND_INITIALIZER),
        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
        state_(State::ThreadStartup),
        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
    pthread_create(&thread_, nullptr, ThreadFunc, this);
  }

  ~Worker() {
    ChangeState(State::ExitAsSoonAsPossible);
    pthread_join(thread_, nullptr);
  }

  // Changes State; may be called from either the worker thread
  // or the master thread. Not all state transitions are legal;
  // this is guarded by assertions.
  void ChangeState(State new_state) {
    ScopedProfilingLabel label("Worker::ChangeState");
    pthread_mutex_lock(&state_mutex_);
    assert(new_state != state_);
    switch (state_) {
      case State::ThreadStartup:
        assert(new_state == State::Ready);
        break;
      case State::Ready:
        assert(new_state == State::HasWork ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      case State::HasWork:
        assert(new_state == State::Ready ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      default:
        abort();
    }
    state_ = new_state;
    pthread_cond_signal(&state_cond_);
    if (state_ == State::Ready) {
      counter_to_decrement_when_ready_->DecrementCount();
    }
    pthread_mutex_unlock(&state_mutex_);
  }

  // Thread entry point.
  void ThreadFunc() {
    ScopedProfilingLabel label("Worker::ThreadFunc");
    RegisterCurrentThreadForProfiling();

    ChangeState(State::Ready);

    // Thread main loop
    while (true) {
      // Get a state to act on.
      // In the 'Ready' state, we have nothing to do but to wait until
      // we switch to another state.
      State state_to_act_upon = WaitForVariableChange(
          &state_, State::Ready, &state_cond_, &state_mutex_);

      // We now have a state to act on, so act.
      switch (state_to_act_upon) {
        case State::HasWork:
          // Got work to do! So do it, and then revert to the 'Ready' state.
          ReadBarrier();
          assert(task_);
          task_->Run();
          delete task_;
          task_ = nullptr;
          ChangeState(State::Ready);
          break;
        case State::ExitAsSoonAsPossible:
          return;
        default:
          abort();
      }
    }
  }

  static void* ThreadFunc(void* arg) {
    static_cast<Worker*>(arg)->ThreadFunc();
    return nullptr;
  }

  // Called by the master thread to give this worker work to do.
  // It is only legal to call this if the worker is in the 'Ready' state.
  void StartWork(Task* task) {
    assert(!task_);
    task->local_allocator = &local_allocator_;
    task_ = task;
    WriteBarrier();
    assert(state_ == State::Ready);
    ChangeState(State::HasWork);
  }

 private:
  // The underlying thread.
  pthread_t thread_;

  // The task to be worked on.
  const Task* task_;

  // The condition variable and mutex guarding state changes.
  pthread_cond_t state_cond_;
  pthread_mutex_t state_mutex_;

  // The state enum tells if we're currently working, waiting for work, etc.
  State state_;

  // Each worker has a local allocator so that workers can allocate temporary
  // buffers without blocking each other.
  Allocator local_allocator_;

  // Pointer to the master thread's BlockingCounter object, used to notify the
  // master thread when this worker switches to the 'Ready' state.
  BlockingCounter* const counter_to_decrement_when_ready_;
};

// A very simple pool of workers that only allows the very specific
// parallelization pattern that we use here: a fixed number of workers
// can be given work, and one then waits for all of them to finish.
class WorkersPool {
 public:
  WorkersPool() {}

  ~WorkersPool() {
    for (auto w : workers_) {
      delete w;
    }
  }

  BlockingCounter& counter_to_decrement_when_ready() {
    return counter_to_decrement_when_ready_;
  }

  // Give work to a specific worker.
  void StartWorker(int index, Task* task_) {
    assert(static_cast<std::size_t>(index) < workers_.size());
    workers_[index]->StartWork(task_);
  }

  // Ensures that the pool has at least the given count of workers.
  // If any new worker has to be created, this function waits for it to
  // be ready.
  void CreateWorkers(std::size_t workers_count) {
    if (workers_.size() >= workers_count) {
      return;
    }
    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
    while (workers_.size() < workers_count) {
      workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
    }
    counter_to_decrement_when_ready_.Wait();
  }

 private:
  // Copy construction disallowed.
  WorkersPool(const WorkersPool&) = delete;

  // The workers in this pool. They are owned by the pool:
  // the pool creates workers and destroys them in its destructor.
  std::vector<Worker*> workers_;

  // The BlockingCounter used to wait for the workers.
  BlockingCounter counter_to_decrement_when_ready_;
};

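// Illustrative sketch (not part of the library): the one usage pattern that
// this pool supports, shown with the hypothetical IncrementTask from the
// sketch above. StartWorker() passes ownership of the task to the worker,
// which deletes it after running it.
//
//   WorkersPool pool;
//   pool.CreateWorkers(2);
//   pool.counter_to_decrement_when_ready().Reset(2);
//   int results[2] = {0, 0};
//   pool.StartWorker(0, new IncrementTask(&results[0]));
//   pool.StartWorker(1, new IncrementTask(&results[1]));
//   pool.counter_to_decrement_when_ready().Wait();  // both tasks have run
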
// The task we use to implement a multi-threaded Gemm: a block of the
// RHS has been packed by the master thread; each worker thread
// then has to pack a block of the LHS and accumulate the Gemm of these
// packed LHS and RHS blocks.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
struct GemmWithPackedRhsTask : Task {
  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
  GemmWithPackedRhsTask(const KernelBase& _kernel,
                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
                        const PackedRhs& _packed_rhs,
                        MatrixMap<OutputScalar, ResultOrder>* _result,
                        const LhsOffset& _lhs_offset,
                        const RhsOffset& _rhs_offset,
                        const OutputPipelineType& _output_pipeline)
      : kernel(_kernel),
        lhs(_lhs),
        packed_rhs(_packed_rhs),
        result(*_result),
        lhs_offset(_lhs_offset),
        rhs_offset(_rhs_offset),
        output_pipeline(_output_pipeline) {}

  void Run() const override {
    ScopedProfilingLabel label("GemmWithPackedRhsTask");

    const int rows = result.rows();
    const int cols = result.cols();
    const int depth = lhs.cols();

    BlockParams block_params;
    block_params.Init<KernelFormat>(rows, cols, depth, 1);

    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);

    PackedResult packed_result(local_allocator, block_params);

    local_allocator->Commit();

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      for (int r = 0; r < rows; r += block_params.l2_rows) {
        int rs = std::min(block_params.l2_rows, rows - r);

        PackLhs<BitDepthParams>(&packed_lhs, lhs.block(r, 0, rs, depth));

        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);

        auto result_block = result.block(r, c, rs, cs);
        UnpackResult<BitDepthParams>(&result_block, packed_result, depth,
                                     packed_lhs.sums_of_each_slice(),
                                     packed_rhs.sums_of_each_slice(),
                                     lhs_offset, rhs_offset, output_pipeline);
      }
    }

    local_allocator->Decommit();
  }

  const KernelBase& kernel;
  const MatrixMap<const InputScalar, LhsOrder> lhs;
  const PackedRhs packed_rhs;
  MatrixMap<OutputScalar, ResultOrder> result;
  const LhsOffset& lhs_offset;
  const RhsOffset& rhs_offset;
  const OutputPipelineType& output_pipeline;
};

class MultiThreadGemmContext : public SingleThreadGemmContext {
 public:
  MultiThreadGemmContext() : max_num_threads_(0) {}

  void set_max_num_threads(int n) { max_num_threads_ = n; }

  int max_num_threads() const { return max_num_threads_; }

  WorkersPool* workers_pool() { return &workers_pool_; }

  Allocator* main_thread_task_allocator() {
    return &main_thread_task_allocator_;
  }

 protected:
  // The workers pool used by MultiThreadGemm. Making
  // this part of the context allows it to be persistent,
  // avoiding recreating threads on every Gemm.
  WorkersPool workers_pool_;

  // The maximum number of worker threads to use (in addition
  // to the master thread).
  // The default value 0 means the default behavior of
  // detecting the number of hardware threads. Nonzero values mean
  // skipping and overriding hardware detection.
  int max_num_threads_;

  // For N-threaded operations, we will use only N-1 worker threads
  // while the last task will be run directly on the main thread.
  // That task will then use this main_thread_task_allocator_; having a
  // dedicated allocator for it (separate from the base allocator_)
  // lets us use the same code for all tasks regardless of which
  // thread they run on.
  Allocator main_thread_task_allocator_;
};

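// Illustrative sketch (not part of the library): limiting a context to at
// most 2 threads before passing it to MultiThreadGemm() below. Leaving the
// value at its default of 0 keeps the hardware-detection behavior.
//
//   MultiThreadGemmContext context;
//   context.set_max_num_threads(2);
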
// Determines how many threads should be used for a given Gemm
// operation.
template <int KernelRows>
inline int HowManyThreads(MultiThreadGemmContext* context, int rows, int cols,
                          int depth) {
  // First check if the user set an explicit maximum number of threads.
  int max_count = context->max_num_threads();
  if (!max_count) {
    // No user-set maximum number of threads, so we need to
    // do some hardware detection.
    // This is expensive to query, so we do it only once.
    // Too bad for dynamicness. Also, we don't use the C++11 standard getter
    // (std::thread::hardware_concurrency) because Google's coding style
    // currently bans #include <thread>.
    static const int hardware_threads_count =
        static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));

    max_count = hardware_threads_count;
  }

  // Basic calculation: take into account max pool size, and
  // how many rows we have to feed our kernel.
  // The motivation for an absolute minimum number of rows per thread,
  // potentially higher than KernelRows, is that very thin thread workloads
  // currently defeat assumptions of the AddMod generator, resulting
  // in substantial bias in TestWithRealData on 24 threads.
  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
  // so as to be independent of the number of threads.
  static const int AbsoluteMinRowsPerThread = 16;
  static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
                                          ? KernelRows
                                          : AbsoluteMinRowsPerThread;
  int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));

  // At this point, for small products we already have thread_count==1, so
  // we can avoid doing more work; otherwise, we still want to check
  // that the cubic size (rows*cols*depth) is big enough to keep
  // all the workers busy.
  if (thread_count > 1) {
    // Empirically determined value.
    static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;

    // Compute the product in 64-bit arithmetic, as multiplying all three
    // sizes in int could overflow.
    const std::uint64_t cubic_size =
        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);

    thread_count =
        std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));

    if (thread_count < 1) {
      thread_count = 1;
    }
  }

  assert(thread_count > 0 && thread_count <= max_count);
  return thread_count;
}

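// Worked example (illustrative only, with made-up sizes): assuming
// KernelRows = 4 on an 8-core device (max_count = 8), a 512x16 result with
// depth 16 gives:
//   MinRowsPerThread = max(4, 16) = 16
//   thread_count     = min(8, CeilQuotient(512, 16)) = min(8, 32) = 8
//   cubic_size       = 512 * 16 * 16 = 131072
//   thread_count     = min(8, 131072 / 65536) = 2
// so only 2 threads are used despite the 8 available cores.
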
// The main multi-threaded Gemm function.
// To understand it, first read the code of SingleThreadGemm().
// The parallelization scheme used here is to have this master function
// pack a block of RHS and then start worker threads to pack a block of LHS
// each, and accumulate the corresponding products.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void MultiThreadGemm(MultiThreadGemmContext* context, const KernelBase& kernel,
                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
                     MatrixMap<OutputScalar, ResultOrder>* result,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  const int thread_count =
      HowManyThreads<KernelFormat::kRows>(context, rows, cols, depth);
  if (thread_count == 1) {
    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
                            BitDepthParams>(context, kernel, lhs, rhs, result,
                                            lhs_offset, rhs_offset,
                                            output_pipeline);
  }
  assert(thread_count > 1);

  // We choose to use a worker thread for all but one
  // of the thread workloads. The remaining thread workload will be
  // executed immediately on the current thread.
  // In this way, the total number of threads (1 master, N-1 workers)
  // equals the value returned by HowManyThreads(). This simple
  // 1:1 mapping of threads to physical cores is very important
  // to getting good multithreaded performance, especially for
  // not-very-large GEMMs, and especially on Android.
  const int workers_count = thread_count - 1;

  Allocator* allocator = context->allocator();
  WorkersPool* workers_pool = context->workers_pool();

  workers_pool->CreateWorkers(workers_count);

  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, workers_count);

  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(
      Side::Rhs, allocator, block_params);
  allocator->Commit();

  // We loop over large blocks of the RHS.
  for (int c = 0; c < cols; c += block_params.l2_cols) {
    int cs = std::min(block_params.l2_cols, cols - c);

    // Pack a large block of the RHS.
    PackRhs<BitDepthParams>(&packed_rhs, rhs.block(0, c, depth, cs));

    // Give work to each worker.
    int next_start_row = 0;
    workers_pool->counter_to_decrement_when_ready().Reset(workers_count);
    for (int thread = 0; thread < thread_count; thread++) {
      int start_row = next_start_row;
      next_start_row = std::min(rows, RoundUp<KernelFormat::kRows>(
                                          rows * (thread + 1) / thread_count));

      int block_rows = next_start_row - start_row;
      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
      auto result_block = result->block(start_row, c, block_rows, cs);
      typedef GemmWithPackedRhsTask<KernelFormat, InputScalar, OutputScalar,
                                    BitDepthParams, LhsOrder, RhsOrder,
                                    ResultOrder, LhsOffset, RhsOffset,
                                    OutputPipelineType>
          TaskType;
      auto task = new TaskType(kernel, lhs_block, packed_rhs, &result_block,
                               lhs_offset, rhs_offset, output_pipeline);
      if (thread < workers_count) {
        workers_pool->StartWorker(thread, task);
      } else {
        // Execute the remaining workload immediately on the current thread.
        task->local_allocator = context->main_thread_task_allocator();
        task->Run();
        delete task;
      }
    }
    // Wait for the workers.
    workers_pool->counter_to_decrement_when_ready().Wait();
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_