// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_

#include <cassert>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"

#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <string>
#include <unordered_map>
#endif

namespace gemmlowp {

class SingleThreadGemmContext {
 public:
  Allocator* allocator() { return &allocator_; }

  void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
  void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
  void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }

  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
  float l2_rhs_factor() const { return l2_rhs_factor_; }

 protected:
  Allocator allocator_;
  // The cache configuration to use.
  int l1_bytes_to_use_ = kDefaultL1CacheSize;
  int l2_bytes_to_use_ = kDefaultL2CacheSize;
  float l2_rhs_factor_ = kDefaultL2RhsFactor;
};
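
// Usage sketch (illustrative only; the byte counts below are hypothetical
// tuning values, not gemmlowp defaults): a caller may override the cache
// parameters on a context before running a GEMM with it, e.g.
//
//   SingleThreadGemmContext context;
//   context.set_l1_bytes_to_use(16 * 1024);   // budget for L1-sized blocks
//   context.set_l2_bytes_to_use(256 * 1024);  // budget for L2-sized blocks
//   context.set_l2_rhs_factor(0.75f);         // fraction of L2 for the RHS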

template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void SingleThreadGemm(SingleThreadGemmContext* context,
                      const KernelBase& kernel,
                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
                      MatrixMap<OutputScalar, ResultOrder>* result,
                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                      const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // Zero sizes should have been caught earlier and handled by an early
  // return.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case rows < cols should have been caught earlier and handled by
  // transposing.
  assert(rows >= cols);

  Allocator* allocator = context->allocator();

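  // Compute the cache-blocking parameters: the loops below walk the matrices
  // in l2_rows x l2_cols x l2_depth blocks sized from the L2 budget, which
  // the compute stage in turn consumes in l1_rows x l1_cols x l1_depth
  // sub-blocks sized from the L1 budget configured on the context.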
  BlockParams block_params;
  block_params.Init<KernelFormat>(
      rows, cols, depth, 1, context->l1_bytes_to_use(),
      context->l2_bytes_to_use(), context->l2_rhs_factor());

#ifdef GEMMLOWP_PROFILING_SIZES
  // Using a static map of label strings. Not reentrant at all!
  static std::unordered_map<std::uint64_t, std::string> labels_map;
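  // Pack (rows, depth, cols) into one key, each in its own 16-bit field of
  // the XOR; dimensions of 2^16 or more can make distinct shapes collide.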
  std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
                             (static_cast<std::uint64_t>(depth) << 16) ^
                             (static_cast<std::uint64_t>(cols) << 32);
  if (!labels_map.count(sizes_hash)) {
    char label[256];
    snprintf(label, sizeof(label),
             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
             block_params.l1_cols);
    labels_map[sizes_hash] = label;
  }
  ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
#endif

  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
                                                         block_params);
  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);

  PackedResult packed_result(allocator, block_params);

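  // The packed blocks above only registered their storage needs with the
  // allocator; Commit() performs the single underlying allocation that backs
  // all of them, and Decommit() at the end releases it.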
  allocator->Commit();

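  // If a single l2 block spans all the columns, the packed RHS is valid for
  // the whole GEMM and can be packed once, outside the loop over row blocks.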
  const bool pack_rhs_once = block_params.l2_cols >= cols;

  if (pack_rhs_once) {
    PackRhs(&packed_rhs, rhs);
  }

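  // Main loop nest over l2 blocks of the result: pack the LHS once per row
  // block, pack the RHS once per column block (unless it was packed above),
  // run the kernel, then unpack into the destination.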
  for (int r = 0; r < rows; r += block_params.l2_rows) {
    int rs = std::min(block_params.l2_rows, rows - r);

    PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      if (!pack_rhs_once) {
        PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
      }

      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
              depth);

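      // Unpacking converts the accumulators back to the destination scalar
      // type, folding in lhs_offset/rhs_offset (via the per-slice sums
      // computed during packing) and applying the output pipeline.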
      UnpackResult<KernelFormat>(
          result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
          packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
          lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
    }
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_