// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// multi_thread_gemm.h: Multi-threaded GEMM entry point.
// Readers note: To understand this file, it is useful to first
// read and understand the much simpler single_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_

#include <pthread.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

#include "single_thread_gemm.h"

namespace gemmlowp {

// On x86 and ARM platforms we may busy-wait in a spin loop before falling
// back to waiting on a pthread condition variable. Implementing that
// correctly requires explicit memory load/store barriers.

#if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \
    (defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86))

#define GEMMLOWP_USE_BUSYWAIT

const int kMaxBusyWaitNOPs = 32 * 1000 * 1000;

#define GEMMLOWP_NOP "nop\n"

#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)

inline int Do256NOPs() {
  asm volatile(GEMMLOWP_NOP64);
  return 64;
}

#undef GEMMLOWP_STRING_CONCAT_4
#undef GEMMLOWP_NOP64
#undef GEMMLOWP_NOP16
#undef GEMMLOWP_NOP4
#undef GEMMLOWP_NOP

inline void WriteBarrier() {
#ifdef GEMMLOWP_ARM_32
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  asm volatile("dmb ishst" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("sfence" ::: "memory");
#else
#error "Unsupported architecture for WriteBarrier."
#endif
}

inline void ReadBarrier() {
#ifdef GEMMLOWP_ARM_32
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  asm volatile("dmb ishld" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("lfence" ::: "memory");
#else
#error "Unsupported architecture for ReadBarrier."
#endif
}

#endif  // GEMMLOWP_USE_BUSYWAIT

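// How these barriers are meant to be paired, as a sketch ('payload', 'flag'
// and 'kReadyFlagValue' are illustrative names, not symbols defined in
// gemmlowp; the real uses in this file are StartWork() publishing task_
// before flipping state_, and WaitForVariableChange() issuing a ReadBarrier()
// once it observes the flip):
//
//   // Publishing thread: make the payload visible before the flag.
//   payload = ...;
//   WriteBarrier();
//   flag = kReadyFlagValue;
//
//   // Spinning thread: only read the payload after observing the flag.
//   while (flag != kReadyFlagValue) { /* busy-wait */ }
//   ReadBarrier();
//   use(payload);
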
// Waits until *var != initial_value.
//
// Returns the new value of *var. The guarantee here is that
// the return value is different from initial_value, and that that
// new value has been taken by *var at some point during the
// execution of this function. There is no guarantee that this is
// still the value of *var when this function returns, since *var is
// not assumed to be guarded by any lock.
//
// First does some busy-waiting for a fixed number of no-op cycles,
// then falls back to passive waiting for the given condvar, guarded
// by the given mutex.
//
// The idea of doing some initial busy-waiting is to help get
// better and more consistent multithreading benefits for small GEMM sizes.
// Busy-waiting helps ensure that if we need to wake up soon after having
// started waiting, then we can wake up quickly (as opposed to, say,
// having to wait to be scheduled again by the OS). On the other hand,
// we must still eventually revert to passive waiting for longer waits
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
// so as to avoid permanently spinning.
//
template <typename T>
T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond,
                        pthread_mutex_t* mutex) {
#ifdef GEMMLOWP_USE_BUSYWAIT
  // If we are on a platform that supports it, spin for some time.
  {
    int nops = 0;
    // First, trivial case where the variable already changed value.
    T new_value = *var;
    if (new_value != initial_value) {
      ReadBarrier();
      return new_value;
    }
    // Then try busy-waiting.
    while (nops < kMaxBusyWaitNOPs) {
      nops += Do256NOPs();
      new_value = *var;
      if (new_value != initial_value) {
        ReadBarrier();
        return new_value;
      }
    }
  }
#endif

  // Finally, do real passive waiting.
  pthread_mutex_lock(mutex);
  T new_value = *var;
  if (new_value == initial_value) {
    pthread_cond_wait(cond, mutex);
    new_value = *var;
    assert(new_value != initial_value);
  }
  pthread_mutex_unlock(mutex);
  return new_value;
}

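// Sketch of the notifier side that pairs with WaitForVariableChange() (this is
// what Worker::ChangeState() and BlockingCounter::DecrementCount() below do):
//
//   pthread_mutex_lock(&mutex);
//   *var = new_value;             // also observed by the busy-wait loop above
//   pthread_cond_signal(&cond);   // wakes a passively waiting thread, if any
//   pthread_mutex_unlock(&mutex);
//
// Performing the store and the signal under the mutex is what guarantees that
// a waiter cannot miss the change between checking *var and blocking on cond.
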
// A BlockingCounter lets one thread wait for N events to occur.
// This is how the master thread waits for all the worker threads
// to have finished working.
class BlockingCounter {
 public:
  BlockingCounter()
      : cond_(PTHREAD_COND_INITIALIZER),
        mutex_(PTHREAD_MUTEX_INITIALIZER),
        count_(0),
        initial_count_(0) {}

  // Sets/resets the counter; initial_count is the number of
  // decrementing events that the Wait() call will be waiting for.
  void Reset(std::size_t initial_count) {
    pthread_mutex_lock(&mutex_);
    assert(count_ == 0);
    initial_count_ = initial_count;
    count_ = initial_count_;
    pthread_mutex_unlock(&mutex_);
  }

  // Decrements the counter; if the counter hits zero, signals
  // the thread that was waiting for that, and returns true.
  // Otherwise (if the decremented count is still nonzero),
  // returns false.
  bool DecrementCount() {
    pthread_mutex_lock(&mutex_);
    assert(count_ > 0);
    count_--;
#ifdef GEMMLOWP_USE_BUSYWAIT
    WriteBarrier();
#endif
    if (count_ == 0) {
      pthread_cond_signal(&cond_);
    }
    bool retval = count_ == 0;
    pthread_mutex_unlock(&mutex_);
    return retval;
  }

  // Waits for the N other threads (N having been set by Reset())
  // to hit the BlockingCounter.
  void Wait() {
    ScopedProfilingLabel label("BlockingCounter::Wait");
    while (count_) {
      MemoryBarrier();
      const std::size_t count_value = count_;
      if (count_value) {
        WaitForVariableChange(&count_, count_value, &cond_, &mutex_);
      }
    }
  }

 private:
  pthread_cond_t cond_;
  pthread_mutex_t mutex_;
  std::size_t count_;
  std::size_t initial_count_;
};

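// Usage sketch (this is exactly the pattern used by WorkersPool below; the
// identifiers are illustrative):
//
//   BlockingCounter counter;
//   counter.Reset(num_workers);    // expect num_workers decrementing events
//   // ... hand work to num_workers other threads, each of which calls
//   // counter.DecrementCount() exactly once when it is done ...
//   counter.Wait();                // blocks until the count has hit zero
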
// A workload for a worker.
struct Task {
  Task() : local_allocator(nullptr) {}
  virtual ~Task() {}
  virtual void Run() = 0;
  Allocator* local_allocator;
};

// A worker thread.
class Worker {
 public:
  enum class State {
    ThreadStartup,        // The initial state before the thread main loop runs.
    Ready,                // Is not working, has not yet received new work to do.
    HasWork,              // Has work to do.
    ExitAsSoonAsPossible  // Should exit at earliest convenience.
  };

  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
      : task_(nullptr),
        state_cond_(PTHREAD_COND_INITIALIZER),
        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
        state_(State::ThreadStartup),
        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
    pthread_create(&thread_, nullptr, ThreadFunc, this);
  }

  ~Worker() {
    ChangeState(State::ExitAsSoonAsPossible);
    pthread_join(thread_, nullptr);
  }

  // Changes State; may be called from either the worker thread
  // or the master thread; however, not all state transitions are legal,
  // which is guarded by assertions.
  void ChangeState(State new_state) {
    ScopedProfilingLabel label("Worker::ChangeState");
    pthread_mutex_lock(&state_mutex_);
    assert(new_state != state_);
    switch (state_) {
      case State::ThreadStartup:
        assert(new_state == State::Ready);
        break;
      case State::Ready:
        assert(new_state == State::HasWork ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      case State::HasWork:
        assert(new_state == State::Ready ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      default:
        abort();
    }
    state_ = new_state;
    pthread_cond_signal(&state_cond_);
    if (state_ == State::Ready) {
      counter_to_decrement_when_ready_->DecrementCount();
    }
    pthread_mutex_unlock(&state_mutex_);
  }

  // Thread entry point.
  void ThreadFunc() {
    ScopedProfilingLabel label("Worker::ThreadFunc");
    RegisterCurrentThreadForProfiling();

    ChangeState(State::Ready);

    // Thread main loop.
    while (true) {
      // Get a state to act on.
      // In the 'Ready' state, we have nothing to do but to wait until
      // we switch to another state.
      State state_to_act_upon = WaitForVariableChange(
          &state_, State::Ready, &state_cond_, &state_mutex_);

      // We now have a state to act on, so act.
      switch (state_to_act_upon) {
        case State::HasWork:
          // Got work to do! So do it, and then revert to 'Ready' state.
          assert(task_);
          task_->Run();
          task_ = nullptr;
          ChangeState(State::Ready);
          break;
        case State::ExitAsSoonAsPossible:
          return;
        default:
          abort();
      }
    }
  }

  static void* ThreadFunc(void* arg) {
    static_cast<Worker*>(arg)->ThreadFunc();
    return nullptr;
  }

  // Called by the master thread to give this worker work to do.
  // It is only legal to call this if the worker is currently in the
  // 'Ready' state.
  void StartWork(Task* task) {
    assert(!task_);
    task->local_allocator = &local_allocator_;
    task_ = task;
#ifdef GEMMLOWP_USE_BUSYWAIT
    WriteBarrier();
#endif
    assert(state_ == State::Ready);
    ChangeState(State::HasWork);
  }

 private:
  // The underlying thread.
  pthread_t thread_;

  // The task to be worked on.
  Task* task_;

  // The condition variable and mutex guarding state changes.
  pthread_cond_t state_cond_;
  pthread_mutex_t state_mutex_;

  // The state enum tells if we're currently working, waiting for work, etc.
  State state_;

  // Each worker thread has its own local allocator, so that tasks can
  // allocate temporary buffers without blocking each other.
  Allocator local_allocator_;

  // Pointer to the master thread's BlockingCounter object, used to notify
  // the master thread when this worker switches to the 'Ready' state.
  BlockingCounter* const counter_to_decrement_when_ready_;
};

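// Sketch of the handshake between the master thread and one Worker (this is
// the pattern that WorkersPool below builds on; 'counter' is the
// BlockingCounter passed to the Worker constructor, 'task' points to some
// Task subclass instance, whose ownership the Worker does not take):
//
//   counter.Reset(1);
//   Worker* worker = new Worker(&counter);  // thread starts, reaches 'Ready',
//                                           // decrements the counter once.
//   counter.Wait();                         // wait for the startup handshake.
//
//   counter.Reset(1);
//   worker->StartWork(task);                // 'Ready' -> 'HasWork'; the worker
//                                           // runs the task, goes back to
//                                           // 'Ready' and decrements again.
//   counter.Wait();                         // wait for the task to finish.
//
//   delete worker;                          // ~Worker() requests exit, joins.
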
// A very simple pool of workers, that only allows the very
// specific parallelization pattern that we use here:
// a fixed number of workers can be given work, and one then
// waits for all of them to finish.
//
// See MultiThreadGemmContextBase below for how other WorkersPool
// implementations can be plugged in: all that an alternative implementation
// has to provide is an Execute() method with the same semantics (run all the
// given tasks and return once they have all finished).
class WorkersPool {
 public:
  WorkersPool() {}

  ~WorkersPool() {
    for (auto w : workers_) {
      delete w;
    }
  }

  void Execute(const std::vector<Task*>& tasks) {
    assert(tasks.size() >= 1);
    // One of the tasks will be run on the current thread.
    std::size_t workers_count = tasks.size() - 1;
    CreateWorkers(workers_count);
    assert(workers_count <= workers_.size());
    counter_to_decrement_when_ready_.Reset(workers_count);
    int n = 0;
    std::for_each(tasks.begin(), tasks.end() - 1,
                  [this, &n](Task* task) { workers_[n++]->StartWork(task); });
    // Execute the remaining workload immediately on the current thread.
    Task* task = tasks.back();
    task->local_allocator = &main_thread_task_allocator_;
    task->Run();
    // Wait for the workers submitted above to finish.
    counter_to_decrement_when_ready_.Wait();
    // Cleanup tasks (best to do this from the same thread that allocated
    // the memory).
    std::for_each(tasks.begin(), tasks.end(), [](Task* task) { delete task; });
  }

 private:
  // Ensures that the pool has at least the given count of workers.
  // If any new worker has to be created, this function waits for it to
  // be ready.
  void CreateWorkers(std::size_t workers_count) {
    if (workers_.size() >= workers_count) {
      return;
    }
    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
    while (workers_.size() < workers_count) {
      workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
    }
    counter_to_decrement_when_ready_.Wait();
  }

  // Copy construction disallowed.
  WorkersPool(const WorkersPool&) = delete;

  // The workers in this pool. They are owned by the pool:
  // the pool creates workers and destroys them in its destructor.
  std::vector<Worker*> workers_;

  // The BlockingCounter used to wait for the workers.
  BlockingCounter counter_to_decrement_when_ready_;

  // For N-threaded operations, we will use only N-1 worker threads
  // while the last task will be run directly on the main thread.
  // It will then use this main_thread_task_allocator_; having a
  // dedicated allocator for that (separate from the base allocator_)
  // allows the same code to be used for all tasks regardless of which
  // thread they run on.
  Allocator main_thread_task_allocator_;
};

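// Usage sketch with a hypothetical Task subclass (names are illustrative).
// Execute() takes ownership of the tasks and deletes them, so they must be
// heap-allocated; the last task in the vector runs on the calling thread:
//
//   struct AddOneTask : Task {
//     explicit AddOneTask(int* cell) : cell(cell) {}
//     void Run() override { ++*cell; }
//     int* cell;
//   };
//
//   int values[4] = {0, 0, 0, 0};
//   WorkersPool pool;
//   std::vector<Task*> tasks;
//   for (int i = 0; i < 4; ++i) {
//     tasks.push_back(new AddOneTask(&values[i]));
//   }
//   pool.Execute(tasks);  // returns once all four tasks have run.
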
// The task we use to implement a multi-threaded Gemm: a block of the
// RHS has been packed by the master thread; each worker thread
// then has to pack a block of the LHS and accumulate the Gemm of these
// packed LHS and RHS blocks.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
struct GemmWithPackedRhsTask : Task {
  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
  GemmWithPackedRhsTask(GemmContextType* _context, const KernelBase& _kernel,
                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
                        const PackedRhs& _packed_rhs,
                        MatrixMap<OutputScalar, ResultOrder>* _result,
                        const MatrixBlockBounds& _result_block,
                        const LhsOffset& _lhs_offset,
                        const RhsOffset& _rhs_offset,
                        const OutputPipelineType& _output_pipeline)
      : context(_context),
        kernel(_kernel),
        lhs(_lhs),
        packed_rhs(_packed_rhs),
        result(*_result),
        result_block(_result_block),
        lhs_offset(_lhs_offset),
        rhs_offset(_rhs_offset),
        output_pipeline(_output_pipeline) {}

  void Run() override {
    ScopedProfilingLabel label("GemmWithPackedRhsTask");

    const int rows = result_block.rows;
    const int cols = result_block.cols;
    const int depth = lhs.cols();

    BlockParams block_params;
    block_params.Init<KernelFormat>(rows, cols, depth, 1,
                                    context->l1_bytes_to_use(),
                                    context->l2_bytes_to_use(),
                                    context->l2_rhs_factor());

    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);

    PackedResult packed_result(local_allocator, block_params);

    local_allocator->Commit();

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      for (int r = 0; r < rows; r += block_params.l2_rows) {
        int rs = std::min(block_params.l2_rows, rows - r);

        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
                depth);

        auto curr_result_block = MatrixBlockBounds(
            result_block.start_row + r, result_block.start_col + c, rs, cs);
        UnpackResult<KernelFormat>(
            &result, curr_result_block, packed_result, depth,
            packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
            lhs_offset.block(curr_result_block.start_row, rs),
            rhs_offset.block(curr_result_block.start_col, cs), output_pipeline);
      }
    }

    local_allocator->Decommit();
  }

  const GemmContextType* context;
  const KernelBase& kernel;
  const MatrixMap<const InputScalar, LhsOrder> lhs;
  const PackedRhs packed_rhs;
  MatrixMap<OutputScalar, ResultOrder> result;
  const MatrixBlockBounds result_block;
  const LhsOffset& lhs_offset;
  const RhsOffset& rhs_offset;
  const OutputPipelineType& output_pipeline;
};

// This base class for multi-threading allows subclasses to implement their own
// workers_pool() method. See MultiThreadGemmContext below for an example;
// any other implementation of workers_pool() must return an object with the
// same public methods as WorkersPool.
class MultiThreadGemmContextBase : public SingleThreadGemmContext {
 public:
  void set_max_num_threads(int n) { max_num_threads_ = n; }

  int max_num_threads() const { return max_num_threads_; }

 protected:
  // The maximum number of worker threads to use (including
  // the master thread). The default value 1 means single-threading.
  // That is the default because gemmlowp's primary target is mobile hardware,
  // where thermal constraints usually mean that it may not be realistic to use
  // more than 1 CPU core even if multiple cores are present.
  // The special value 0 means try to detect the number of hardware threads.
  // Note: this assumes that all CPU cores are equivalent. That assumption
  // is defeated on big.LITTLE ARM devices, where we have no API to query
  // the number of big cores (which is typically what we would want to use,
  // leaving aside above-mentioned thermal issues). That is the other reason
  // why the best compromise here is to let max_num_threads_ default to 1,
  // so users who want multi-threading have to make the decision of how many
  // threads to use by themselves.
  int max_num_threads_ = 1;
};

class MultiThreadGemmContext : public MultiThreadGemmContextBase {
 public:
  WorkersPool* workers_pool() { return &workers_pool_; }

 private:
  // The workers pool used by MultiThreadGemm. Making
  // this part of the context allows it to be persistent,
  // avoiding recreating threads on every Gemm.
  WorkersPool workers_pool_;
};

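// Minimal configuration sketch (hypothetical values). MultiThreadGemm() below
// takes a pointer to such a context and reads max_num_threads() and
// workers_pool() from it:
//
//   MultiThreadGemmContext context;
//   context.set_max_num_threads(4);  // 0 = auto-detect, 1 (default) = single
//   // ... reuse the same context across many Gemm calls, so that the worker
//   // threads owned by its WorkersPool persist from one call to the next.
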
// Needed by Chrome native builds.
#ifndef _SC_NPROCESSORS_CONF
#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_ONLN
#endif

// Determines how many threads should be used for a given Gemm
// operation.
template <int KernelRows>
inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) {
  // Early-exit in the default case where multi-threading is disabled.
  if (max_num_threads == 1) {
    return 1;
  }

  // Determine the maximum number of threads.
  int max_count = max_num_threads;
  // The special value 0 means try to determine the total number of cores.
  if (max_count == 0) {
    // No user-set maximum number of threads, so we need to
    // do some hardware detection.
    // This is expensive to query so we do it only once.
    // Too bad for dynamism. Also, we don't use the C++11 standard getter
    // (std::thread::hardware_concurrency()) because Google's coding style
    // currently bans #include <thread>.
    static const int hardware_threads_count =
        static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));

    max_count = hardware_threads_count;
  }

  // Basic calculation: take into account max pool size, and
  // how many rows we have to feed our kernel.
  // The motivation for an absolute minimum number of rows per thread,
  // potentially higher than KernelRows, is that very thin per-thread workloads
  // currently defeat assumptions of the AddMod generator, resulting
  // in substantial bias in TestWithRealData on 24 threads.
  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
  // so as to be independent of the number of threads.
  static const int AbsoluteMinRowsPerThread = 16;
  static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
                                          ? KernelRows
                                          : AbsoluteMinRowsPerThread;
  int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));

  // At this point, for small products we already have thread_count == 1, so
  // we can avoid doing more work; otherwise, we still want to check
  // that the cubic size (rows*cols*depth) is big enough to keep
  // all the workers busy.
  if (thread_count > 1) {
    // Empirically determined value.
    static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;

    // Compute the cubic size in 64 bits to avoid 32-bit integer overflow.
    const std::uint64_t cubic_size =
        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);

    thread_count =
        std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));

    if (thread_count < 1) {
      thread_count = 1;
    }
  }

  assert(thread_count > 0 && thread_count <= max_count);
  return thread_count;
}

// The main multi-threaded Gemm function.
// To understand it, first read the code of SingleThreadGemm().
// The parallelization scheme used here is to have this master function
// pack a block of RHS and then start worker threads to pack a block of LHS
// each, and accumulate the corresponding products.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel,
                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
                     MatrixMap<OutputScalar, ResultOrder>* result,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // Zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows < cols should have been caught earlier and transposed.
  assert(rows >= cols);

  const int thread_count = HowManyThreads<KernelFormat::kRows>(
      context->max_num_threads(), rows, cols, depth);
  if (thread_count == 1) {
    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
                            BitDepthParams>(context, kernel, lhs, rhs, result,
                                            lhs_offset, rhs_offset,
                                            output_pipeline);
  }
  assert(thread_count > 1);

  // Simple 1:1 mapping of tasks to physical cores, which is very important
  // for getting good multithreaded performance, especially for not-very-large
  // GEMMs, and especially on Android.
  const int task_count = thread_count;

  Allocator* allocator = context->allocator();
  auto* workers_pool = context->workers_pool();

  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, task_count,
                                  context->l1_bytes_to_use(),
                                  context->l2_bytes_to_use(),
                                  context->l2_rhs_factor());

  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);
  allocator->Commit();

  // We loop over large blocks of the RHS.
  for (int c = 0; c < cols; c += block_params.l2_cols) {
    int cs = std::min(block_params.l2_cols, cols - c);

    // Pack a large block of the RHS.
    PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));

    // Give work to each worker.
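    // Rows are split into task_count contiguous blocks, with block boundaries
    // rounded up to a multiple of KernelFormat::kRows. For example
    // (hypothetical numbers), with rows = 100, task_count = 3, kRows = 4:
    //   n = 0: 100 * 1 / 3 = 33, rounded up to 36  -> rows [0, 36)
    //   n = 1: 100 * 2 / 3 = 66, rounded up to 68  -> rows [36, 68)
    //   n = 2: 100 * 3 / 3 = 100                   -> rows [68, 100)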
    std::vector<Task*> tasks;
    int next_start_row = 0;
    for (int n = 0; n < task_count; ++n) {
      int start_row = next_start_row;
      next_start_row = std::min(
          rows, RoundUp<KernelFormat::kRows>(rows * (n + 1) / task_count));

      int block_rows = next_start_row - start_row;
      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
      typedef GemmWithPackedRhsTask<
          KernelFormat, InputScalar, OutputScalar, BitDepthParams, LhsOrder,
          RhsOrder, ResultOrder, LhsOffset, RhsOffset, OutputPipelineType,
          GemmContextType>
          TaskType;
      tasks.push_back(
          new TaskType(context, kernel, lhs_block, packed_rhs, result,
                       MatrixBlockBounds(start_row, c, block_rows, cs),
                       lhs_offset, rhs_offset, output_pipeline));
    }
    // Execute the work on the workers (and partially on this thread).
    workers_pool->Execute(tasks);
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_