Home | History | Annotate | Download | only in products
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud (at) inria.fr>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
     11 #define EIGEN_GENERAL_BLOCK_PANEL_H
     12 
     13 
     14 namespace Eigen {
     15 
     16 namespace internal {
     17 
     18 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
     19 class gebp_traits;
     20 
     21 
     22 /** \internal \returns b if a<=0, and returns a otherwise. */
     23 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
     24 {
     25   return a<=0 ? b : a;
     26 }
     27 
     28 #if EIGEN_ARCH_i386_OR_x86_64
     29 const std::ptrdiff_t defaultL1CacheSize = 32*1024;
     30 const std::ptrdiff_t defaultL2CacheSize = 256*1024;
     31 const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
     32 #else
     33 const std::ptrdiff_t defaultL1CacheSize = 16*1024;
     34 const std::ptrdiff_t defaultL2CacheSize = 512*1024;
     35 const std::ptrdiff_t defaultL3CacheSize = 512*1024;
     36 #endif
     37 
     38 /** \internal */
     39 struct CacheSizes {
     40   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
     41     int l1CacheSize, l2CacheSize, l3CacheSize;
     42     queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
     43     m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
     44     m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
     45     m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
     46   }
     47 
     48   std::ptrdiff_t m_l1;
     49   std::ptrdiff_t m_l2;
     50   std::ptrdiff_t m_l3;
     51 };
     52 
     53 
     54 /** \internal */
     55 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
     56 {
     57   static CacheSizes m_cacheSizes;
     58 
     59   if(action==SetAction)
     60   {
     61     // set the cpu cache size and cache all block sizes from a global cache size in byte
     62     eigen_internal_assert(l1!=0 && l2!=0);
     63     m_cacheSizes.m_l1 = *l1;
     64     m_cacheSizes.m_l2 = *l2;
     65     m_cacheSizes.m_l3 = *l3;
     66   }
     67   else if(action==GetAction)
     68   {
     69     eigen_internal_assert(l1!=0 && l2!=0);
     70     *l1 = m_cacheSizes.m_l1;
     71     *l2 = m_cacheSizes.m_l2;
     72     *l3 = m_cacheSizes.m_l3;
     73   }
     74   else
     75   {
     76     eigen_internal_assert(false);
     77   }
     78 }
     79 
     80 /* Helper for computeProductBlockingSizes.
     81  *
     82  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
     83  * this function computes the blocking size parameters along the respective dimensions
     84  * for matrix products and related algorithms. The blocking sizes depends on various
     85  * parameters:
     86  * - the L1 and L2 cache sizes,
     87  * - the register level blocking sizes defined by gebp_traits,
     88  * - the number of scalars that fit into a packet (when vectorization is enabled).
     89  *
     90  * \sa setCpuCacheSizes */
     91 
     92 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
     93 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
     94 {
     95   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     96 
     97   // Explanations:
     98   // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
     99   // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
    100   // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
    101   // at the register level. This small horizontal panel has to stay within L1 cache.
    102   std::ptrdiff_t l1, l2, l3;
    103   manage_caching_sizes(GetAction, &l1, &l2, &l3);
    104 
    105   if (num_threads > 1) {
    106     typedef typename Traits::ResScalar ResScalar;
    107     enum {
    108       kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    109       ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
    110       kr = 8,
    111       mr = Traits::mr,
    112       nr = Traits::nr
    113     };
    114     // Increasing k gives us more time to prefetch the content of the "C"
    115     // registers. However once the latency is hidden there is no point in
    116     // increasing the value of k, so we'll cap it at 320 (value determined
    117     // experimentally).
    118     const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    119     if (k_cache < k) {
    120       k = k_cache - (k_cache % kr);
    121       eigen_internal_assert(k > 0);
    122     }
    123 
    124     const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    125     const Index n_per_thread = numext::div_ceil(n, num_threads);
    126     if (n_cache <= n_per_thread) {
    127       // Don't exceed the capacity of the l2 cache.
    128       eigen_internal_assert(n_cache >= static_cast<Index>(nr));
    129       n = n_cache - (n_cache % nr);
    130       eigen_internal_assert(n > 0);
    131     } else {
    132       n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    133     }
    134 
    135     if (l3 > l2) {
    136       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
    137       const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
    138       const Index m_per_thread = numext::div_ceil(m, num_threads);
    139       if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
    140         m = m_cache - (m_cache % mr);
    141         eigen_internal_assert(m > 0);
    142       } else {
    143         m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
    144       }
    145     }
    146   }
    147   else {
    148     // In unit tests we do not want to use extra large matrices,
    149     // so we reduce the cache size to check the blocking strategy is not flawed
    150 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    151     l1 = 9*1024;
    152     l2 = 32*1024;
    153     l3 = 512*1024;
    154 #endif
    155 
    156     // Early return for small problems because the computation below are time consuming for small problems.
    157     // Perhaps it would make more sense to consider k*n*m??
    158     // Note that for very tiny problem, this function should be bypassed anyway
    159     // because we use the coefficient-based implementation for them.
    160     if((numext::maxi)(k,(numext::maxi)(m,n))<48)
    161       return;
    162 
    163     typedef typename Traits::ResScalar ResScalar;
    164     enum {
    165       k_peeling = 8,
    166       k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    167       k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    168     };
    169 
    170     // ---- 1st level of blocking on L1, yields kc ----
    171 
    172     // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
    173     // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
    174     // We also include a register-level block of the result (mx x nr).
    175     // (In an ideal world only the lhs panel would stay in L1)
    176     // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
    177     const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    178     const Index old_k = k;
    179     if(k>max_kc)
    180     {
    181       // We are really blocking on the third dimension:
    182       // -> reduce blocking size to make sure the last block is as large as possible
    183       //    while keeping the same number of sweeps over the result.
    184       k = (k%max_kc)==0 ? max_kc
    185                         : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
    186 
    187       eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    188     }
    189 
    190     // ---- 2nd level of blocking on max(L2,L3), yields nc ----
    191 
    192     // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
    193     //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
    194     // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
    195     // For instance, it corresponds to 6MB of L3 shared among 4 cores.
    196     #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    197     const Index actual_l2 = l3;
    198     #else
    199     const Index actual_l2 = 1572864; // == 1.5 MB
    200     #endif
    201 
    202     // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
    203     // The second half is implicitly reserved to access the result and lhs coefficients.
    204     // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
    205     // to limit this growth: we bound nc to growth by a factor x1.5.
    206     // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
    207     // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
    208     Index max_nc;
    209     const Index lhs_bytes = m * k * sizeof(LhsScalar);
    210     const Index remaining_l1 = l1- k_sub - lhs_bytes;
    211     if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    212     {
    213       // L1 blocking
    214       max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    215     }
    216     else
    217     {
    218       // L2 blocking
    219       max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    220     }
    221     // WARNING Below, we assume that Traits::nr is a power of two.
    222     Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    223     if(n>nc)
    224     {
    225       // We are really blocking over the columns:
    226       // -> reduce blocking size to make sure the last block is as large as possible
    227       //    while keeping the same number of sweeps over the packed lhs.
    228       //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
    229       n = (n%nc)==0 ? nc
    230                     : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
    231     }
    232     else if(old_k==k)
    233     {
    234       // So far, no blocking at all, i.e., kc==k, and nc==n.
    235       // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
    236       // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
    237       Index problem_size = k*n*sizeof(LhsScalar);
    238       Index actual_lm = actual_l2;
    239       Index max_mc = m;
    240       if(problem_size<=1024)
    241       {
    242         // problem is small enough to keep in L1
    243         // Let's choose m such that lhs's block fit in 1/3 of L1
    244         actual_lm = l1;
    245       }
    246       else if(l3!=0 && problem_size<=32768)
    247       {
    248         // we have both L2 and L3, and problem is small enough to be kept in L2
    249         // Let's choose m such that lhs's block fit in 1/3 of L2
    250         actual_lm = l2;
    251         max_mc = (numext::mini<Index>)(576,max_mc);
    252       }
    253       Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    254       if (mc > Traits::mr) mc -= mc % Traits::mr;
    255       else if (mc==0) return;
    256       m = (m%mc)==0 ? mc
    257                     : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
    258     }
    259   }
    260 }
    261 
    262 template <typename Index>
    263 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
    264 {
    265 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
    266   if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    267     k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    268     m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    269     n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    270     return true;
    271   }
    272 #else
    273   EIGEN_UNUSED_VARIABLE(k)
    274   EIGEN_UNUSED_VARIABLE(m)
    275   EIGEN_UNUSED_VARIABLE(n)
    276 #endif
    277   return false;
    278 }
    279 
    280 /** \brief Computes the blocking parameters for a m x k times k x n matrix product
    281   *
    282   * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
    283   * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
    284   * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
    285   *
    286   * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
    287   * this function computes the blocking size parameters along the respective dimensions
    288   * for matrix products and related algorithms.
    289   *
    290   * The blocking size parameters may be evaluated:
    291   *   - either by a heuristic based on cache sizes;
    292   *   - or using fixed prescribed values (for testing purposes).
    293   *
    294   * \sa setCpuCacheSizes */
    295 
    296 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
    297 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
    298 {
    299   if (!useSpecificBlockingSizes(k, m, n)) {
    300     evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
    301   }
    302 }
    303 
    304 template<typename LhsScalar, typename RhsScalar, typename Index>
    305 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
    306 {
    307   computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
    308 }
    309 
    310 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
    311   #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
    312 #else
    313 
    314   // FIXME (a bit overkill maybe ?)
    315 
    316   template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    317     EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    318     {
    319       c = cj.pmadd(a,b,c);
    320     }
    321   };
    322 
    323   template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    324     EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    325     {
    326       t = b; t = cj.pmul(a,t); c = padd(c,t);
    327     }
    328   };
    329 
    330   template<typename CJ, typename A, typename B, typename C, typename T>
    331   EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
    332   {
    333     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
    334   }
    335 
    336   #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
    337 //   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
    338 #endif
    339 
    340 /* Vectorization logic
    341  *  real*real: unpack rhs to constant packets, ...
    342  *
    343  *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
    344  *          storing each res packet into two packets (2x2),
    345  *          at the end combine them: swap the second and addsub them
    346  *  cf*cf : same but with 2x4 blocks
    347  *  cplx*real : unpack rhs to constant packets, ...
    348  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
    349  */
    350 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
    351 class gebp_traits
    352 {
    353 public:
    354   typedef _LhsScalar LhsScalar;
    355   typedef _RhsScalar RhsScalar;
    356   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
    357 
    358   enum {
    359     ConjLhs = _ConjLhs,
    360     ConjRhs = _ConjRhs,
    361     Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    362     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    363     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    364     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
    365 
    366     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    367 
    368     // register block size along the N direction must be 1 or 4
    369     nr = 4,
    370 
    371     // register block size along the M direction (currently, this one cannot be modified)
    372     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
    373 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    374     // we assume 16 registers
    375     // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
    376     // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
    377     mr = Vectorizable ? 3*LhsPacketSize : default_mr,
    378 #else
    379     mr = default_mr,
    380 #endif
    381 
    382     LhsProgress = LhsPacketSize,
    383     RhsProgress = 1
    384   };
    385 
    386   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
    387   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
    388   typedef typename packet_traits<ResScalar>::type  _ResPacket;
    389 
    390   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
    391   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
    392   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
    393 
    394   typedef ResPacket AccPacket;
    395 
    396   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
    397   {
    398     p = pset1<ResPacket>(ResScalar(0));
    399   }
    400 
    401   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
    402   {
    403     pbroadcast4(b, b0, b1, b2, b3);
    404   }
    405 
    406 //   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
    407 //   {
    408 //     pbroadcast2(b, b0, b1);
    409 //   }
    410 
    411   template<typename RhsPacketType>
    412   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
    413   {
    414     dest = pset1<RhsPacketType>(*b);
    415   }
    416 
    417   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
    418   {
    419     dest = ploadquad<RhsPacket>(b);
    420   }
    421 
    422   template<typename LhsPacketType>
    423   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
    424   {
    425     dest = pload<LhsPacketType>(a);
    426   }
    427 
    428   template<typename LhsPacketType>
    429   EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
    430   {
    431     dest = ploadu<LhsPacketType>(a);
    432   }
    433 
    434   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
    435   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
    436   {
    437     conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    438     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    439     // let gcc allocate the register in which to store the result of the pmul
    440     // (in the case where there is no FMA) gcc fails to figure out how to avoid
    441     // spilling register.
    442 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    443     EIGEN_UNUSED_VARIABLE(tmp);
    444     c = cj.pmadd(a,b,c);
    445 #else
    446     tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
    447 #endif
    448   }
    449 
    450   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
    451   {
    452     r = pmadd(c,alpha,r);
    453   }
    454 
    455   template<typename ResPacketHalf>
    456   EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
    457   {
    458     r = pmadd(c,alpha,r);
    459   }
    460 
    461 };
    462 
    463 template<typename RealScalar, bool _ConjLhs>
    464 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
    465 {
    466 public:
    467   typedef std::complex<RealScalar> LhsScalar;
    468   typedef RealScalar RhsScalar;
    469   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
    470 
    471   enum {
    472     ConjLhs = _ConjLhs,
    473     ConjRhs = false,
    474     Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    475     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    476     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    477     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
    478 
    479     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    480     nr = 4,
    481 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    482     // we assume 16 registers
    483     mr = 3*LhsPacketSize,
    484 #else
    485     mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
    486 #endif
    487 
    488     LhsProgress = LhsPacketSize,
    489     RhsProgress = 1
    490   };
    491 
    492   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
    493   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
    494   typedef typename packet_traits<ResScalar>::type  _ResPacket;
    495 
    496   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
    497   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
    498   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
    499 
    500   typedef ResPacket AccPacket;
    501 
    502   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
    503   {
    504     p = pset1<ResPacket>(ResScalar(0));
    505   }
    506 
    507   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
    508   {
    509     dest = pset1<RhsPacket>(*b);
    510   }
    511 
    512   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
    513   {
    514     dest = pset1<RhsPacket>(*b);
    515   }
    516 
    517   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
    518   {
    519     dest = pload<LhsPacket>(a);
    520   }
    521 
    522   EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
    523   {
    524     dest = ploadu<LhsPacket>(a);
    525   }
    526 
    527   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
    528   {
    529     pbroadcast4(b, b0, b1, b2, b3);
    530   }
    531 
    532 //   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
    533 //   {
    534 //     pbroadcast2(b, b0, b1);
    535 //   }
    536 
    537   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
    538   {
    539     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
    540   }
    541 
    542   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
    543   {
    544 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    545     EIGEN_UNUSED_VARIABLE(tmp);
    546     c.v = pmadd(a.v,b,c.v);
    547 #else
    548     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
    549 #endif
    550   }
    551 
    552   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
    553   {
    554     c += a * b;
    555   }
    556 
    557   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
    558   {
    559     r = cj.pmadd(c,alpha,r);
    560   }
    561 
    562 protected:
    563   conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
    564 };
    565 
    566 template<typename Packet>
    567 struct DoublePacket
    568 {
    569   Packet first;
    570   Packet second;
    571 };
    572 
    573 template<typename Packet>
    574 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
    575 {
    576   DoublePacket<Packet> res;
    577   res.first  = padd(a.first, b.first);
    578   res.second = padd(a.second,b.second);
    579   return res;
    580 }
    581 
    582 template<typename Packet>
    583 const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
    584 {
    585   return a;
    586 }
    587 
    588 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
    589 // template<typename Packet>
    590 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
    591 // {
    592 //   DoublePacket<Packet> res;
    593 //   res.first  = padd(a.first, b.first);
    594 //   res.second = padd(a.second,b.second);
    595 //   return res;
    596 // }
    597 
    598 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
    599 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
    600 {
    601 public:
    602   typedef std::complex<RealScalar>  Scalar;
    603   typedef std::complex<RealScalar>  LhsScalar;
    604   typedef std::complex<RealScalar>  RhsScalar;
    605   typedef std::complex<RealScalar>  ResScalar;
    606 
    607   enum {
    608     ConjLhs = _ConjLhs,
    609     ConjRhs = _ConjRhs,
    610     Vectorizable = packet_traits<RealScalar>::Vectorizable
    611                 && packet_traits<Scalar>::Vectorizable,
    612     RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
    613     ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
    614     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    615     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    616 
    617     // FIXME: should depend on NumberOfRegisters
    618     nr = 4,
    619     mr = ResPacketSize,
    620 
    621     LhsProgress = ResPacketSize,
    622     RhsProgress = 1
    623   };
    624 
    625   typedef typename packet_traits<RealScalar>::type RealPacket;
    626   typedef typename packet_traits<Scalar>::type     ScalarPacket;
    627   typedef DoublePacket<RealPacket> DoublePacketType;
    628 
    629   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
    630   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
    631   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
    632   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
    633 
    634   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
    635 
    636   EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
    637   {
    638     p.first   = pset1<RealPacket>(RealScalar(0));
    639     p.second  = pset1<RealPacket>(RealScalar(0));
    640   }
    641 
    642   // Scalar path
    643   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
    644   {
    645     dest = pset1<ResPacket>(*b);
    646   }
    647 
    648   // Vectorized path
    649   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
    650   {
    651     dest.first  = pset1<RealPacket>(real(*b));
    652     dest.second = pset1<RealPacket>(imag(*b));
    653   }
    654 
    655   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
    656   {
    657     loadRhs(b,dest);
    658   }
    659   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
    660   {
    661     eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
    662     loadRhs(b,dest);
    663   }
    664 
    665   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
    666   {
    667     // FIXME not sure that's the best way to implement it!
    668     loadRhs(b+0, b0);
    669     loadRhs(b+1, b1);
    670     loadRhs(b+2, b2);
    671     loadRhs(b+3, b3);
    672   }
    673 
    674   // Vectorized path
    675   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
    676   {
    677     // FIXME not sure that's the best way to implement it!
    678     loadRhs(b+0, b0);
    679     loadRhs(b+1, b1);
    680   }
    681 
    682   // Scalar path
    683   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
    684   {
    685     // FIXME not sure that's the best way to implement it!
    686     loadRhs(b+0, b0);
    687     loadRhs(b+1, b1);
    688   }
    689 
    690   // nothing special here
    691   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
    692   {
    693     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
    694   }
    695 
    696   EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
    697   {
    698     dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
    699   }
    700 
    701   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
    702   {
    703     c.first   = padd(pmul(a,b.first), c.first);
    704     c.second  = padd(pmul(a,b.second),c.second);
    705   }
    706 
    707   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
    708   {
    709     c = cj.pmadd(a,b,c);
    710   }
    711 
    712   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
    713 
    714   EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
    715   {
    716     // assemble c
    717     ResPacket tmp;
    718     if((!ConjLhs)&&(!ConjRhs))
    719     {
    720       tmp = pcplxflip(pconj(ResPacket(c.second)));
    721       tmp = padd(ResPacket(c.first),tmp);
    722     }
    723     else if((!ConjLhs)&&(ConjRhs))
    724     {
    725       tmp = pconj(pcplxflip(ResPacket(c.second)));
    726       tmp = padd(ResPacket(c.first),tmp);
    727     }
    728     else if((ConjLhs)&&(!ConjRhs))
    729     {
    730       tmp = pcplxflip(ResPacket(c.second));
    731       tmp = padd(pconj(ResPacket(c.first)),tmp);
    732     }
    733     else if((ConjLhs)&&(ConjRhs))
    734     {
    735       tmp = pcplxflip(ResPacket(c.second));
    736       tmp = psub(pconj(ResPacket(c.first)),tmp);
    737     }
    738 
    739     r = pmadd(tmp,alpha,r);
    740   }
    741 
    742 protected:
    743   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
    744 };
    745 
    746 template<typename RealScalar, bool _ConjRhs>
    747 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
    748 {
    749 public:
    750   typedef std::complex<RealScalar>  Scalar;
    751   typedef RealScalar  LhsScalar;
    752   typedef Scalar      RhsScalar;
    753   typedef Scalar      ResScalar;
    754 
    755   enum {
    756     ConjLhs = false,
    757     ConjRhs = _ConjRhs,
    758     Vectorizable = packet_traits<RealScalar>::Vectorizable
    759                 && packet_traits<Scalar>::Vectorizable,
    760     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    761     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    762     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
    763 
    764     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    765     // FIXME: should depend on NumberOfRegisters
    766     nr = 4,
    767     mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
    768 
    769     LhsProgress = ResPacketSize,
    770     RhsProgress = 1
    771   };
    772 
    773   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
    774   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
    775   typedef typename packet_traits<ResScalar>::type  _ResPacket;
    776 
    777   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
    778   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
    779   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
    780 
    781   typedef ResPacket AccPacket;
    782 
    783   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
    784   {
    785     p = pset1<ResPacket>(ResScalar(0));
    786   }
    787 
    788   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
    789   {
    790     dest = pset1<RhsPacket>(*b);
    791   }
    792 
    793   void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
    794   {
    795     pbroadcast4(b, b0, b1, b2, b3);
    796   }
    797 
    798 //   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
    799 //   {
    800 //     // FIXME not sure that's the best way to implement it!
    801 //     b0 = pload1<RhsPacket>(b+0);
    802 //     b1 = pload1<RhsPacket>(b+1);
    803 //   }
    804 
    805   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
    806   {
    807     dest = ploaddup<LhsPacket>(a);
    808   }
    809 
    810   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
    811   {
    812     eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
    813     loadRhs(b,dest);
    814   }
    815 
    816   EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
    817   {
    818     dest = ploaddup<LhsPacket>(a);
    819   }
    820 
    821   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
    822   {
    823     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
    824   }
    825 
    826   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
    827   {
    828 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    829     EIGEN_UNUSED_VARIABLE(tmp);
    830     c.v = pmadd(a,b.v,c.v);
    831 #else
    832     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
    833 #endif
    834 
    835   }
    836 
    837   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
    838   {
    839     c += a * b;
    840   }
    841 
    842   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
    843   {
    844     r = cj.pmadd(alpha,c,r);
    845   }
    846 
    847 protected:
    848   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
    849 };
    850 
    851 /* optimized GEneral packed Block * packed Panel product kernel
    852  *
    853  * Mixing type logic: C += A * B
    854  *  |  A  |  B  | comments
    855  *  |real |cplx | no vectorization yet, would require to pack A with duplication
    856  *  |cplx |real | easy vectorization
    857  */
    858 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
    859 struct gebp_kernel
    860 {
    861   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
    862   typedef typename Traits::ResScalar ResScalar;
    863   typedef typename Traits::LhsPacket LhsPacket;
    864   typedef typename Traits::RhsPacket RhsPacket;
    865   typedef typename Traits::ResPacket ResPacket;
    866   typedef typename Traits::AccPacket AccPacket;
    867 
    868   typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
    869   typedef typename SwappedTraits::ResScalar SResScalar;
    870   typedef typename SwappedTraits::LhsPacket SLhsPacket;
    871   typedef typename SwappedTraits::RhsPacket SRhsPacket;
    872   typedef typename SwappedTraits::ResPacket SResPacket;
    873   typedef typename SwappedTraits::AccPacket SAccPacket;
    874 
    875   typedef typename DataMapper::LinearMapper LinearMapper;
    876 
    877   enum {
    878     Vectorizable  = Traits::Vectorizable,
    879     LhsProgress   = Traits::LhsProgress,
    880     RhsProgress   = Traits::RhsProgress,
    881     ResPacketSize = Traits::ResPacketSize
    882   };
    883 
    884   EIGEN_DONT_INLINE
    885   void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
    886                   Index rows, Index depth, Index cols, ResScalar alpha,
    887                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
    888 };
    889 
    890 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
    891 EIGEN_DONT_INLINE
    892 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
    893   ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
    894                Index rows, Index depth, Index cols, ResScalar alpha,
    895                Index strideA, Index strideB, Index offsetA, Index offsetB)
    896   {
    897     Traits traits;
    898     SwappedTraits straits;
    899 
    900     if(strideA==-1) strideA = depth;
    901     if(strideB==-1) strideB = depth;
    902     conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
    903     Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
    904     const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
    905     const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
    906     const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
    907     enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
    908     const Index peeled_kc  = depth & ~(pk-1);
    909     const Index prefetch_res_offset = 32/sizeof(ResScalar);
    910 //     const Index depth2     = depth & ~1;
    911 
    912     //---------- Process 3 * LhsProgress rows at once ----------
    913     // This corresponds to 3*LhsProgress x nr register blocks.
    914     // Usually, make sense only with FMA
    915     if(mr>=3*Traits::LhsProgress)
    916     {
    917       // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
    918       // and on each largest micro vertical panel of the rhs (depth * nr).
    919       // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
    920       // However, if depth is too small, we can extend the number of rows of these horizontal panels.
    921       // This actual number of rows is computed as follow:
    922       const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
    923       // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
    924       // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
    925       // or because we are testing specific blocking sizes.
    926       const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
    927       for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    928       {
    929         const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
    930         for(Index j2=0; j2<packet_cols4; j2+=nr)
    931         {
    932           for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
    933           {
    934 
    935           // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
    936           // stored into 3 x nr registers.
    937 
    938           const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
    939           prefetch(&blA[0]);
    940 
    941           // gets res block as register
    942           AccPacket C0, C1, C2,  C3,
    943                     C4, C5, C6,  C7,
    944                     C8, C9, C10, C11;
    945           traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
    946           traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
    947           traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);
    948 
    949           LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
    950           LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
    951           LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
    952           LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
    953 
    954           r0.prefetch(0);
    955           r1.prefetch(0);
    956           r2.prefetch(0);
    957           r3.prefetch(0);
    958 
    959           // performs "inner" products
    960           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
    961           prefetch(&blB[0]);
    962           LhsPacket A0, A1;
    963 
    964           for(Index k=0; k<peeled_kc; k+=pk)
    965           {
    966             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
    967             RhsPacket B_0, T0;
    968             LhsPacket A2;
    969 
    970 #define EIGEN_GEBP_ONESTEP(K) \
    971             do { \
    972               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
    973               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
    974               internal::prefetch(blA+(3*K+16)*LhsProgress); \
    975               if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
    976               traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
    977               traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
    978               traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
    979               traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
    980               traits.madd(A0, B_0, C0, T0); \
    981               traits.madd(A1, B_0, C4, T0); \
    982               traits.madd(A2, B_0, C8, B_0); \
    983               traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
    984               traits.madd(A0, B_0, C1, T0); \
    985               traits.madd(A1, B_0, C5, T0); \
    986               traits.madd(A2, B_0, C9, B_0); \
    987               traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
    988               traits.madd(A0, B_0, C2,  T0); \
    989               traits.madd(A1, B_0, C6,  T0); \
    990               traits.madd(A2, B_0, C10, B_0); \
    991               traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
    992               traits.madd(A0, B_0, C3 , T0); \
    993               traits.madd(A1, B_0, C7,  T0); \
    994               traits.madd(A2, B_0, C11, B_0); \
    995               EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
    996             } while(false)
    997 
    998             internal::prefetch(blB);
    999             EIGEN_GEBP_ONESTEP(0);
   1000             EIGEN_GEBP_ONESTEP(1);
   1001             EIGEN_GEBP_ONESTEP(2);
   1002             EIGEN_GEBP_ONESTEP(3);
   1003             EIGEN_GEBP_ONESTEP(4);
   1004             EIGEN_GEBP_ONESTEP(5);
   1005             EIGEN_GEBP_ONESTEP(6);
   1006             EIGEN_GEBP_ONESTEP(7);
   1007 
   1008             blB += pk*4*RhsProgress;
   1009             blA += pk*3*Traits::LhsProgress;
   1010 
   1011             EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
   1012           }
   1013           // process remaining peeled loop
   1014           for(Index k=peeled_kc; k<depth; k++)
   1015           {
   1016             RhsPacket B_0, T0;
   1017             LhsPacket A2;
   1018             EIGEN_GEBP_ONESTEP(0);
   1019             blB += 4*RhsProgress;
   1020             blA += 3*Traits::LhsProgress;
   1021           }
   1022 
   1023 #undef EIGEN_GEBP_ONESTEP
   1024 
   1025           ResPacket R0, R1, R2;
   1026           ResPacket alphav = pset1<ResPacket>(alpha);
   1027 
   1028           R0 = r0.loadPacket(0 * Traits::ResPacketSize);
   1029           R1 = r0.loadPacket(1 * Traits::ResPacketSize);
   1030           R2 = r0.loadPacket(2 * Traits::ResPacketSize);
   1031           traits.acc(C0, alphav, R0);
   1032           traits.acc(C4, alphav, R1);
   1033           traits.acc(C8, alphav, R2);
   1034           r0.storePacket(0 * Traits::ResPacketSize, R0);
   1035           r0.storePacket(1 * Traits::ResPacketSize, R1);
   1036           r0.storePacket(2 * Traits::ResPacketSize, R2);
   1037 
   1038           R0 = r1.loadPacket(0 * Traits::ResPacketSize);
   1039           R1 = r1.loadPacket(1 * Traits::ResPacketSize);
   1040           R2 = r1.loadPacket(2 * Traits::ResPacketSize);
   1041           traits.acc(C1, alphav, R0);
   1042           traits.acc(C5, alphav, R1);
   1043           traits.acc(C9, alphav, R2);
   1044           r1.storePacket(0 * Traits::ResPacketSize, R0);
   1045           r1.storePacket(1 * Traits::ResPacketSize, R1);
   1046           r1.storePacket(2 * Traits::ResPacketSize, R2);
   1047 
   1048           R0 = r2.loadPacket(0 * Traits::ResPacketSize);
   1049           R1 = r2.loadPacket(1 * Traits::ResPacketSize);
   1050           R2 = r2.loadPacket(2 * Traits::ResPacketSize);
   1051           traits.acc(C2, alphav, R0);
   1052           traits.acc(C6, alphav, R1);
   1053           traits.acc(C10, alphav, R2);
   1054           r2.storePacket(0 * Traits::ResPacketSize, R0);
   1055           r2.storePacket(1 * Traits::ResPacketSize, R1);
   1056           r2.storePacket(2 * Traits::ResPacketSize, R2);
   1057 
   1058           R0 = r3.loadPacket(0 * Traits::ResPacketSize);
   1059           R1 = r3.loadPacket(1 * Traits::ResPacketSize);
   1060           R2 = r3.loadPacket(2 * Traits::ResPacketSize);
   1061           traits.acc(C3, alphav, R0);
   1062           traits.acc(C7, alphav, R1);
   1063           traits.acc(C11, alphav, R2);
   1064           r3.storePacket(0 * Traits::ResPacketSize, R0);
   1065           r3.storePacket(1 * Traits::ResPacketSize, R1);
   1066           r3.storePacket(2 * Traits::ResPacketSize, R2);
   1067           }
   1068         }
   1069 
   1070         // Deal with remaining columns of the rhs
   1071         for(Index j2=packet_cols4; j2<cols; j2++)
   1072         {
   1073           for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
   1074           {
   1075           // One column at a time
   1076           const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
   1077           prefetch(&blA[0]);
   1078 
   1079           // gets res block as register
   1080           AccPacket C0, C4, C8;
   1081           traits.initAcc(C0);
   1082           traits.initAcc(C4);
   1083           traits.initAcc(C8);
   1084 
   1085           LinearMapper r0 = res.getLinearMapper(i, j2);
   1086           r0.prefetch(0);
   1087 
   1088           // performs "inner" products
   1089           const RhsScalar* blB = &blockB[j2*strideB+offsetB];
   1090           LhsPacket A0, A1, A2;
   1091 
   1092           for(Index k=0; k<peeled_kc; k+=pk)
   1093           {
   1094             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
   1095             RhsPacket B_0;
   1096 #define EIGEN_GEBGP_ONESTEP(K) \
   1097             do { \
   1098               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
   1099               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
   1100               traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
   1101               traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
   1102               traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
   1103               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
   1104               traits.madd(A0, B_0, C0, B_0); \
   1105               traits.madd(A1, B_0, C4, B_0); \
   1106               traits.madd(A2, B_0, C8, B_0); \
   1107               EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
   1108             } while(false)
   1109 
   1110             EIGEN_GEBGP_ONESTEP(0);
   1111             EIGEN_GEBGP_ONESTEP(1);
   1112             EIGEN_GEBGP_ONESTEP(2);
   1113             EIGEN_GEBGP_ONESTEP(3);
   1114             EIGEN_GEBGP_ONESTEP(4);
   1115             EIGEN_GEBGP_ONESTEP(5);
   1116             EIGEN_GEBGP_ONESTEP(6);
   1117             EIGEN_GEBGP_ONESTEP(7);
   1118 
   1119             blB += pk*RhsProgress;
   1120             blA += pk*3*Traits::LhsProgress;
   1121 
   1122             EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
   1123           }
   1124 
   1125           // process remaining peeled loop
   1126           for(Index k=peeled_kc; k<depth; k++)
   1127           {
   1128             RhsPacket B_0;
   1129             EIGEN_GEBGP_ONESTEP(0);
   1130             blB += RhsProgress;
   1131             blA += 3*Traits::LhsProgress;
   1132           }
   1133 #undef EIGEN_GEBGP_ONESTEP
   1134           ResPacket R0, R1, R2;
   1135           ResPacket alphav = pset1<ResPacket>(alpha);
   1136 
   1137           R0 = r0.loadPacket(0 * Traits::ResPacketSize);
   1138           R1 = r0.loadPacket(1 * Traits::ResPacketSize);
   1139           R2 = r0.loadPacket(2 * Traits::ResPacketSize);
   1140           traits.acc(C0, alphav, R0);
   1141           traits.acc(C4, alphav, R1);
   1142           traits.acc(C8, alphav, R2);
   1143           r0.storePacket(0 * Traits::ResPacketSize, R0);
   1144           r0.storePacket(1 * Traits::ResPacketSize, R1);
   1145           r0.storePacket(2 * Traits::ResPacketSize, R2);
   1146           }
   1147         }
   1148       }
   1149     }
   1150 
   1151     //---------- Process 2 * LhsProgress rows at once ----------
   1152     if(mr>=2*Traits::LhsProgress)
   1153     {
   1154       const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
   1155       // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
   1156       // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
   1157       // or because we are testing specific blocking sizes.
   1158       Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
   1159 
   1160       for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
   1161       {
   1162         Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
   1163         for(Index j2=0; j2<packet_cols4; j2+=nr)
   1164         {
   1165           for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
   1166           {
   1167 
   1168           // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
   1169           // stored into 2 x nr registers.
   1170 
   1171           const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
   1172           prefetch(&blA[0]);
   1173 
   1174           // gets res block as register
   1175           AccPacket C0, C1, C2, C3,
   1176                     C4, C5, C6, C7;
   1177           traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
   1178           traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
   1179 
   1180           LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
   1181           LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
   1182           LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
   1183           LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
   1184 
   1185           r0.prefetch(prefetch_res_offset);
   1186           r1.prefetch(prefetch_res_offset);
   1187           r2.prefetch(prefetch_res_offset);
   1188           r3.prefetch(prefetch_res_offset);
   1189 
   1190           // performs "inner" products
   1191           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
   1192           prefetch(&blB[0]);
   1193           LhsPacket A0, A1;
   1194 
   1195           for(Index k=0; k<peeled_kc; k+=pk)
   1196           {
   1197             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
   1198             RhsPacket B_0, B1, B2, B3, T0;
   1199 
   1200    #define EIGEN_GEBGP_ONESTEP(K) \
   1201             do {                                                                \
   1202               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
   1203               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
   1204               traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
   1205               traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
   1206               traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
   1207               traits.madd(A0, B_0, C0, T0);                                     \
   1208               traits.madd(A1, B_0, C4, B_0);                                    \
   1209               traits.madd(A0, B1,  C1, T0);                                     \
   1210               traits.madd(A1, B1,  C5, B1);                                     \
   1211               traits.madd(A0, B2,  C2, T0);                                     \
   1212               traits.madd(A1, B2,  C6, B2);                                     \
   1213               traits.madd(A0, B3,  C3, T0);                                     \
   1214               traits.madd(A1, B3,  C7, B3);                                     \
   1215               EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
   1216             } while(false)
   1217 
   1218             internal::prefetch(blB+(48+0));
   1219             EIGEN_GEBGP_ONESTEP(0);
   1220             EIGEN_GEBGP_ONESTEP(1);
   1221             EIGEN_GEBGP_ONESTEP(2);
   1222             EIGEN_GEBGP_ONESTEP(3);
   1223             internal::prefetch(blB+(48+16));
   1224             EIGEN_GEBGP_ONESTEP(4);
   1225             EIGEN_GEBGP_ONESTEP(5);
   1226             EIGEN_GEBGP_ONESTEP(6);
   1227             EIGEN_GEBGP_ONESTEP(7);
   1228 
   1229             blB += pk*4*RhsProgress;
   1230             blA += pk*(2*Traits::LhsProgress);
   1231 
   1232             EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
   1233           }
   1234           // process remaining peeled loop
   1235           for(Index k=peeled_kc; k<depth; k++)
   1236           {
   1237             RhsPacket B_0, B1, B2, B3, T0;
   1238             EIGEN_GEBGP_ONESTEP(0);
   1239             blB += 4*RhsProgress;
   1240             blA += 2*Traits::LhsProgress;
   1241           }
   1242 #undef EIGEN_GEBGP_ONESTEP
   1243 
   1244           ResPacket R0, R1, R2, R3;
   1245           ResPacket alphav = pset1<ResPacket>(alpha);
   1246 
   1247           R0 = r0.loadPacket(0 * Traits::ResPacketSize);
   1248           R1 = r0.loadPacket(1 * Traits::ResPacketSize);
   1249           R2 = r1.loadPacket(0 * Traits::ResPacketSize);
   1250           R3 = r1.loadPacket(1 * Traits::ResPacketSize);
   1251           traits.acc(C0, alphav, R0);
   1252           traits.acc(C4, alphav, R1);
   1253           traits.acc(C1, alphav, R2);
   1254           traits.acc(C5, alphav, R3);
   1255           r0.storePacket(0 * Traits::ResPacketSize, R0);
   1256           r0.storePacket(1 * Traits::ResPacketSize, R1);
   1257           r1.storePacket(0 * Traits::ResPacketSize, R2);
   1258           r1.storePacket(1 * Traits::ResPacketSize, R3);
   1259 
   1260           R0 = r2.loadPacket(0 * Traits::ResPacketSize);
   1261           R1 = r2.loadPacket(1 * Traits::ResPacketSize);
   1262           R2 = r3.loadPacket(0 * Traits::ResPacketSize);
   1263           R3 = r3.loadPacket(1 * Traits::ResPacketSize);
   1264           traits.acc(C2,  alphav, R0);
   1265           traits.acc(C6,  alphav, R1);
   1266           traits.acc(C3,  alphav, R2);
   1267           traits.acc(C7,  alphav, R3);
   1268           r2.storePacket(0 * Traits::ResPacketSize, R0);
   1269           r2.storePacket(1 * Traits::ResPacketSize, R1);
   1270           r3.storePacket(0 * Traits::ResPacketSize, R2);
   1271           r3.storePacket(1 * Traits::ResPacketSize, R3);
   1272           }
   1273         }
   1274 
   1275         // Deal with remaining columns of the rhs
   1276         for(Index j2=packet_cols4; j2<cols; j2++)
   1277         {
   1278           for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
   1279           {
   1280           // One column at a time
   1281           const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
   1282           prefetch(&blA[0]);
   1283 
   1284           // gets res block as register
   1285           AccPacket C0, C4;
   1286           traits.initAcc(C0);
   1287           traits.initAcc(C4);
   1288 
   1289           LinearMapper r0 = res.getLinearMapper(i, j2);
   1290           r0.prefetch(prefetch_res_offset);
   1291 
   1292           // performs "inner" products
   1293           const RhsScalar* blB = &blockB[j2*strideB+offsetB];
   1294           LhsPacket A0, A1;
   1295 
   1296           for(Index k=0; k<peeled_kc; k+=pk)
   1297           {
   1298             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
   1299             RhsPacket B_0, B1;
   1300 
   1301 #define EIGEN_GEBGP_ONESTEP(K) \
   1302             do {                                                                  \
   1303               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
   1304               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
   1305               traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
   1306               traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
   1307               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
   1308               traits.madd(A0, B_0, C0, B1);                                       \
   1309               traits.madd(A1, B_0, C4, B_0);                                      \
   1310               EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
   1311             } while(false)
   1312 
   1313             EIGEN_GEBGP_ONESTEP(0);
   1314             EIGEN_GEBGP_ONESTEP(1);
   1315             EIGEN_GEBGP_ONESTEP(2);
   1316             EIGEN_GEBGP_ONESTEP(3);
   1317             EIGEN_GEBGP_ONESTEP(4);
   1318             EIGEN_GEBGP_ONESTEP(5);
   1319             EIGEN_GEBGP_ONESTEP(6);
   1320             EIGEN_GEBGP_ONESTEP(7);
   1321 
   1322             blB += pk*RhsProgress;
   1323             blA += pk*2*Traits::LhsProgress;
   1324 
   1325             EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
   1326           }
   1327 
   1328           // process remaining peeled loop
   1329           for(Index k=peeled_kc; k<depth; k++)
   1330           {
   1331             RhsPacket B_0, B1;
   1332             EIGEN_GEBGP_ONESTEP(0);
   1333             blB += RhsProgress;
   1334             blA += 2*Traits::LhsProgress;
   1335           }
   1336 #undef EIGEN_GEBGP_ONESTEP
   1337           ResPacket R0, R1;
   1338           ResPacket alphav = pset1<ResPacket>(alpha);
   1339 
   1340           R0 = r0.loadPacket(0 * Traits::ResPacketSize);
   1341           R1 = r0.loadPacket(1 * Traits::ResPacketSize);
   1342           traits.acc(C0, alphav, R0);
   1343           traits.acc(C4, alphav, R1);
   1344           r0.storePacket(0 * Traits::ResPacketSize, R0);
   1345           r0.storePacket(1 * Traits::ResPacketSize, R1);
   1346           }
   1347         }
   1348       }
   1349     }
   1350     //---------- Process 1 * LhsProgress rows at once ----------
   1351     if(mr>=1*Traits::LhsProgress)
   1352     {
   1353       // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
   1354       for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
   1355       {
   1356         // loops on each largest micro vertical panel of rhs (depth * nr)
   1357         for(Index j2=0; j2<packet_cols4; j2+=nr)
   1358         {
   1359           // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
   1360           // stored into 1 x nr registers.
   1361 
   1362           const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
   1363           prefetch(&blA[0]);
   1364 
   1365           // gets res block as register
   1366           AccPacket C0, C1, C2, C3;
   1367           traits.initAcc(C0);
   1368           traits.initAcc(C1);
   1369           traits.initAcc(C2);
   1370           traits.initAcc(C3);
   1371 
   1372           LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
   1373           LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
   1374           LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
   1375           LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
   1376 
   1377           r0.prefetch(prefetch_res_offset);
   1378           r1.prefetch(prefetch_res_offset);
   1379           r2.prefetch(prefetch_res_offset);
   1380           r3.prefetch(prefetch_res_offset);
   1381 
   1382           // performs "inner" products
   1383           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
   1384           prefetch(&blB[0]);
   1385           LhsPacket A0;
   1386 
   1387           for(Index k=0; k<peeled_kc; k+=pk)
   1388           {
   1389             EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
   1390             RhsPacket B_0, B1, B2, B3;
   1391 
   1392 #define EIGEN_GEBGP_ONESTEP(K) \
   1393             do {                                                                \
   1394               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");        \
   1395               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
   1396               traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
   1397               traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
   1398               traits.madd(A0, B_0, C0, B_0);                                    \
   1399               traits.madd(A0, B1,  C1, B1);                                     \
   1400               traits.madd(A0, B2,  C2, B2);                                     \
   1401               traits.madd(A0, B3,  C3, B3);                                     \
   1402               EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");          \
   1403             } while(false)
   1404 
   1405             internal::prefetch(blB+(48+0));
   1406             EIGEN_GEBGP_ONESTEP(0);
   1407             EIGEN_GEBGP_ONESTEP(1);
   1408             EIGEN_GEBGP_ONESTEP(2);
   1409             EIGEN_GEBGP_ONESTEP(3);
   1410             internal::prefetch(blB+(48+16));
   1411             EIGEN_GEBGP_ONESTEP(4);
   1412             EIGEN_GEBGP_ONESTEP(5);
   1413             EIGEN_GEBGP_ONESTEP(6);
   1414             EIGEN_GEBGP_ONESTEP(7);
   1415 
   1416             blB += pk*4*RhsProgress;
   1417             blA += pk*1*LhsProgress;
   1418 
   1419             EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
   1420           }
   1421           // process remaining peeled loop
   1422           for(Index k=peeled_kc; k<depth; k++)
   1423           {
   1424             RhsPacket B_0, B1, B2, B3;
   1425             EIGEN_GEBGP_ONESTEP(0);
   1426             blB += 4*RhsProgress;
   1427             blA += 1*LhsProgress;
   1428           }
   1429 #undef EIGEN_GEBGP_ONESTEP
   1430 
   1431           ResPacket R0, R1;
   1432           ResPacket alphav = pset1<ResPacket>(alpha);
   1433 
   1434           R0 = r0.loadPacket(0 * Traits::ResPacketSize);
   1435           R1 = r1.loadPacket(0 * Traits::ResPacketSize);
   1436           traits.acc(C0, alphav, R0);
   1437           traits.acc(C1,  alphav, R1);
   1438           r0.storePacket(0 * Traits::ResPacketSize, R0);
   1439           r1.storePacket(0 * Traits::ResPacketSize, R1);
   1440 
   1441           R0 = r2.loadPacket(0 * Traits::ResPacketSize);
   1442           R1 = r3.loadPacket(0 * Traits::ResPacketSize);
   1443           traits.acc(C2,  alphav, R0);
   1444           traits.acc(C3,  alphav, R1);
   1445           r2.storePacket(0 * Traits::ResPacketSize, R0);
   1446           r3.storePacket(0 * Traits::ResPacketSize, R1);
   1447         }
   1448 
   1449         // Deal with remaining columns of the rhs
   1450         for(Index j2=packet_cols4; j2<cols; j2++)
   1451         {
   1452           // One column at a time
   1453           const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
   1454           prefetch(&blA[0]);
   1455 
   1456           // gets res block as register
   1457           AccPacket C0;
   1458           traits.initAcc(C0);
   1459 
   1460           LinearMapper r0 = res.getLinearMapper(i, j2);
   1461 
   1462           // performs "inner" products
   1463           const RhsScalar* blB = &blockB[j2*strideB+offsetB];
   1464           LhsPacket A0;
   1465 
   1466           for(Index k=0; k<peeled_kc; k+=pk)
   1467           {
   1468             EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
   1469             RhsPacket B_0;
   1470 
   1471 #define EIGEN_GEBGP_ONESTEP(K) \
   1472             do {                                                                \
   1473               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
   1474               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
   1475               traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
   1476               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
   1477               traits.madd(A0, B_0, C0, B_0);                                    \
   1478               EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
   1479             } while(false);
   1480 
   1481             EIGEN_GEBGP_ONESTEP(0);
   1482             EIGEN_GEBGP_ONESTEP(1);
   1483             EIGEN_GEBGP_ONESTEP(2);
   1484             EIGEN_GEBGP_ONESTEP(3);
   1485             EIGEN_GEBGP_ONESTEP(4);
   1486             EIGEN_GEBGP_ONESTEP(5);
   1487             EIGEN_GEBGP_ONESTEP(6);
   1488             EIGEN_GEBGP_ONESTEP(7);
   1489 
   1490             blB += pk*RhsProgress;
   1491             blA += pk*1*Traits::LhsProgress;
   1492 
   1493             EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
   1494           }
   1495 
   1496           // process remaining peeled loop
   1497           for(Index k=peeled_kc; k<depth; k++)
   1498           {
   1499             RhsPacket B_0;
   1500             EIGEN_GEBGP_ONESTEP(0);
   1501             blB += RhsProgress;
   1502             blA += 1*Traits::LhsProgress;
   1503           }
   1504 #undef EIGEN_GEBGP_ONESTEP
   1505           ResPacket R0;
   1506           ResPacket alphav = pset1<ResPacket>(alpha);
   1507           R0 = r0.loadPacket(0 * Traits::ResPacketSize);
   1508           traits.acc(C0, alphav, R0);
   1509           r0.storePacket(0 * Traits::ResPacketSize, R0);
   1510         }
   1511       }
   1512     }
   1513     //---------- Process remaining rows, 1 at once ----------
   1514     if(peeled_mc1<rows)
   1515     {
   1516       // loop on each panel of the rhs
   1517       for(Index j2=0; j2<packet_cols4; j2+=nr)
   1518       {
   1519         // loop on each row of the lhs (1*LhsProgress x depth)
   1520         for(Index i=peeled_mc1; i<rows; i+=1)
   1521         {
   1522           const LhsScalar* blA = &blockA[i*strideA+offsetA];
   1523           prefetch(&blA[0]);
   1524           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
   1525 
   1526           // The following piece of code wont work for 512 bit registers
   1527           // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
   1528           // as nr (which is currently 4) for the return type.
   1529           typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
   1530           if ((SwappedTraits::LhsProgress % 4) == 0 &&
   1531               (SwappedTraits::LhsProgress <= 8) &&
   1532               (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
   1533           {
   1534             SAccPacket C0, C1, C2, C3;
   1535             straits.initAcc(C0);
   1536             straits.initAcc(C1);
   1537             straits.initAcc(C2);
   1538             straits.initAcc(C3);
   1539 
   1540             const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
   1541             const Index endk  = (depth/spk)*spk;
   1542             const Index endk4 = (depth/(spk*4))*(spk*4);
   1543 
   1544             Index k=0;
   1545             for(; k<endk4; k+=4*spk)
   1546             {
   1547               SLhsPacket A0,A1;
   1548               SRhsPacket B_0,B_1;
   1549 
   1550               straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
   1551               straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
   1552 
   1553               straits.loadRhsQuad(blA+0*spk, B_0);
   1554               straits.loadRhsQuad(blA+1*spk, B_1);
   1555               straits.madd(A0,B_0,C0,B_0);
   1556               straits.madd(A1,B_1,C1,B_1);
   1557 
   1558               straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
   1559               straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
   1560               straits.loadRhsQuad(blA+2*spk, B_0);
   1561               straits.loadRhsQuad(blA+3*spk, B_1);
   1562               straits.madd(A0,B_0,C2,B_0);
   1563               straits.madd(A1,B_1,C3,B_1);
   1564 
   1565               blB += 4*SwappedTraits::LhsProgress;
   1566               blA += 4*spk;
   1567             }
   1568             C0 = padd(padd(C0,C1),padd(C2,C3));
   1569             for(; k<endk; k+=spk)
   1570             {
   1571               SLhsPacket A0;
   1572               SRhsPacket B_0;
   1573 
   1574               straits.loadLhsUnaligned(blB, A0);
   1575               straits.loadRhsQuad(blA, B_0);
   1576               straits.madd(A0,B_0,C0,B_0);
   1577 
   1578               blB += SwappedTraits::LhsProgress;
   1579               blA += spk;
   1580             }
   1581             if(SwappedTraits::LhsProgress==8)
   1582             {
   1583               // Special case where we have to first reduce the accumulation register C0
   1584               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
   1585               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
   1586               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
   1587               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
   1588 
   1589               SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
   1590               SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
   1591 
   1592               if(depth-endk>0)
   1593               {
   1594                 // We have to handle the last row of the rhs which corresponds to a half-packet
   1595                 SLhsPacketHalf a0;
   1596                 SRhsPacketHalf b0;
   1597                 straits.loadLhsUnaligned(blB, a0);
   1598                 straits.loadRhs(blA, b0);
   1599                 SAccPacketHalf c0 = predux_downto4(C0);
   1600                 straits.madd(a0,b0,c0,b0);
   1601                 straits.acc(c0, alphav, R);
   1602               }
   1603               else
   1604               {
   1605                 straits.acc(predux_downto4(C0), alphav, R);
   1606               }
   1607               res.scatterPacket(i, j2, R);
   1608             }
   1609             else
   1610             {
   1611               SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
   1612               SResPacket alphav = pset1<SResPacket>(alpha);
   1613               straits.acc(C0, alphav, R);
   1614               res.scatterPacket(i, j2, R);
   1615             }
   1616           }
   1617           else // scalar path
   1618           {
   1619             // get a 1 x 4 res block as registers
   1620             ResScalar C0(0), C1(0), C2(0), C3(0);
   1621 
   1622             for(Index k=0; k<depth; k++)
   1623             {
   1624               LhsScalar A0;
   1625               RhsScalar B_0, B_1;
   1626 
   1627               A0 = blA[k];
   1628 
   1629               B_0 = blB[0];
   1630               B_1 = blB[1];
   1631               CJMADD(cj,A0,B_0,C0,  B_0);
   1632               CJMADD(cj,A0,B_1,C1,  B_1);
   1633 
   1634               B_0 = blB[2];
   1635               B_1 = blB[3];
   1636               CJMADD(cj,A0,B_0,C2,  B_0);
   1637               CJMADD(cj,A0,B_1,C3,  B_1);
   1638 
   1639               blB += 4;
   1640             }
   1641             res(i, j2 + 0) += alpha * C0;
   1642             res(i, j2 + 1) += alpha * C1;
   1643             res(i, j2 + 2) += alpha * C2;
   1644             res(i, j2 + 3) += alpha * C3;
   1645           }
   1646         }
   1647       }
   1648       // remaining columns
   1649       for(Index j2=packet_cols4; j2<cols; j2++)
   1650       {
   1651         // loop on each row of the lhs (1*LhsProgress x depth)
   1652         for(Index i=peeled_mc1; i<rows; i+=1)
   1653         {
   1654           const LhsScalar* blA = &blockA[i*strideA+offsetA];
   1655           prefetch(&blA[0]);
   1656           // gets a 1 x 1 res block as registers
   1657           ResScalar C0(0);
   1658           const RhsScalar* blB = &blockB[j2*strideB+offsetB];
   1659           for(Index k=0; k<depth; k++)
   1660           {
   1661             LhsScalar A0 = blA[k];
   1662             RhsScalar B_0 = blB[k];
   1663             CJMADD(cj, A0, B_0, C0, B_0);
   1664           }
   1665           res(i, j2) += alpha * C0;
   1666         }
   1667       }
   1668     }
   1669   }
   1670 
   1671 
   1672 #undef CJMADD
   1673 
   1674 // pack a block of the lhs
   1675 // The traversal is as follow (mr==4):
   1676 //   0  4  8 12 ...
   1677 //   1  5  9 13 ...
   1678 //   2  6 10 14 ...
   1679 //   3  7 11 15 ...
   1680 //
   1681 //  16 20 24 28 ...
   1682 //  17 21 25 29 ...
   1683 //  18 22 26 30 ...
   1684 //  19 23 27 31 ...
   1685 //
   1686 //  32 33 34 35 ...
   1687 //  36 36 38 39 ...
   1688 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
   1689 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
   1690 {
   1691   typedef typename DataMapper::LinearMapper LinearMapper;
   1692   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
   1693 };
   1694 
   1695 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
   1696 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
   1697   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   1698 {
   1699   typedef typename packet_traits<Scalar>::type Packet;
   1700   enum { PacketSize = packet_traits<Scalar>::size };
   1701 
   1702   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   1703   EIGEN_UNUSED_VARIABLE(stride);
   1704   EIGEN_UNUSED_VARIABLE(offset);
   1705   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   1706   eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
   1707   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   1708   Index count = 0;
   1709 
   1710   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
   1711   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
   1712   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
   1713   const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
   1714                          : Pack2>1             ? (rows/Pack2)*Pack2 : 0;
   1715 
   1716   Index i=0;
   1717 
   1718   // Pack 3 packets
   1719   if(Pack1>=3*PacketSize)
   1720   {
   1721     for(; i<peeled_mc3; i+=3*PacketSize)
   1722     {
   1723       if(PanelMode) count += (3*PacketSize) * offset;
   1724 
   1725       for(Index k=0; k<depth; k++)
   1726       {
   1727         Packet A, B, C;
   1728         A = lhs.loadPacket(i+0*PacketSize, k);
   1729         B = lhs.loadPacket(i+1*PacketSize, k);
   1730         C = lhs.loadPacket(i+2*PacketSize, k);
   1731         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
   1732         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
   1733         pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
   1734       }
   1735       if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
   1736     }
   1737   }
   1738   // Pack 2 packets
   1739   if(Pack1>=2*PacketSize)
   1740   {
   1741     for(; i<peeled_mc2; i+=2*PacketSize)
   1742     {
   1743       if(PanelMode) count += (2*PacketSize) * offset;
   1744 
   1745       for(Index k=0; k<depth; k++)
   1746       {
   1747         Packet A, B;
   1748         A = lhs.loadPacket(i+0*PacketSize, k);
   1749         B = lhs.loadPacket(i+1*PacketSize, k);
   1750         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
   1751         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
   1752       }
   1753       if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
   1754     }
   1755   }
   1756   // Pack 1 packets
   1757   if(Pack1>=1*PacketSize)
   1758   {
   1759     for(; i<peeled_mc1; i+=1*PacketSize)
   1760     {
   1761       if(PanelMode) count += (1*PacketSize) * offset;
   1762 
   1763       for(Index k=0; k<depth; k++)
   1764       {
   1765         Packet A;
   1766         A = lhs.loadPacket(i+0*PacketSize, k);
   1767         pstore(blockA+count, cj.pconj(A));
   1768         count+=PacketSize;
   1769       }
   1770       if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
   1771     }
   1772   }
   1773   // Pack scalars
   1774   if(Pack2<PacketSize && Pack2>1)
   1775   {
   1776     for(; i<peeled_mc0; i+=Pack2)
   1777     {
   1778       if(PanelMode) count += Pack2 * offset;
   1779 
   1780       for(Index k=0; k<depth; k++)
   1781         for(Index w=0; w<Pack2; w++)
   1782           blockA[count++] = cj(lhs(i+w, k));
   1783 
   1784       if(PanelMode) count += Pack2 * (stride-offset-depth);
   1785     }
   1786   }
   1787   for(; i<rows; i++)
   1788   {
   1789     if(PanelMode) count += offset;
   1790     for(Index k=0; k<depth; k++)
   1791       blockA[count++] = cj(lhs(i, k));
   1792     if(PanelMode) count += (stride-offset-depth);
   1793   }
   1794 }
   1795 
   1796 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
   1797 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
   1798 {
   1799   typedef typename DataMapper::LinearMapper LinearMapper;
   1800   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
   1801 };
   1802 
   1803 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
   1804 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
   1805   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   1806 {
   1807   typedef typename packet_traits<Scalar>::type Packet;
   1808   enum { PacketSize = packet_traits<Scalar>::size };
   1809 
   1810   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   1811   EIGEN_UNUSED_VARIABLE(stride);
   1812   EIGEN_UNUSED_VARIABLE(offset);
   1813   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   1814   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   1815   Index count = 0;
   1816 
   1817 //   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
   1818 //   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
   1819 //   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
   1820 
   1821   int pack = Pack1;
   1822   Index i = 0;
   1823   while(pack>0)
   1824   {
   1825     Index remaining_rows = rows-i;
   1826     Index peeled_mc = i+(remaining_rows/pack)*pack;
   1827     for(; i<peeled_mc; i+=pack)
   1828     {
   1829       if(PanelMode) count += pack * offset;
   1830 
   1831       const Index peeled_k = (depth/PacketSize)*PacketSize;
   1832       Index k=0;
   1833       if(pack>=PacketSize)
   1834       {
   1835         for(; k<peeled_k; k+=PacketSize)
   1836         {
   1837           for (Index m = 0; m < pack; m += PacketSize)
   1838           {
   1839             PacketBlock<Packet> kernel;
   1840             for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
   1841             ptranspose(kernel);
   1842             for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
   1843           }
   1844           count += PacketSize*pack;
   1845         }
   1846       }
   1847       for(; k<depth; k++)
   1848       {
   1849         Index w=0;
   1850         for(; w<pack-3; w+=4)
   1851         {
   1852           Scalar a(cj(lhs(i+w+0, k))),
   1853                  b(cj(lhs(i+w+1, k))),
   1854                  c(cj(lhs(i+w+2, k))),
   1855                  d(cj(lhs(i+w+3, k)));
   1856           blockA[count++] = a;
   1857           blockA[count++] = b;
   1858           blockA[count++] = c;
   1859           blockA[count++] = d;
   1860         }
   1861         if(pack%4)
   1862           for(;w<pack;++w)
   1863             blockA[count++] = cj(lhs(i+w, k));
   1864       }
   1865 
   1866       if(PanelMode) count += pack * (stride-offset-depth);
   1867     }
   1868 
   1869     pack -= PacketSize;
   1870     if(pack<Pack2 && (pack+PacketSize)!=Pack2)
   1871       pack = Pack2;
   1872   }
   1873 
   1874   for(; i<rows; i++)
   1875   {
   1876     if(PanelMode) count += offset;
   1877     for(Index k=0; k<depth; k++)
   1878       blockA[count++] = cj(lhs(i, k));
   1879     if(PanelMode) count += (stride-offset-depth);
   1880   }
   1881 }
   1882 
   1883 // copy a complete panel of the rhs
   1884 // this version is optimized for column major matrices
   1885 // The traversal order is as follow: (nr==4):
   1886 //  0  1  2  3   12 13 14 15   24 27
   1887 //  4  5  6  7   16 17 18 19   25 28
   1888 //  8  9 10 11   20 21 22 23   26 29
   1889 //  .  .  .  .    .  .  .  .    .  .
   1890 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
   1891 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   1892 {
   1893   typedef typename packet_traits<Scalar>::type Packet;
   1894   typedef typename DataMapper::LinearMapper LinearMapper;
   1895   enum { PacketSize = packet_traits<Scalar>::size };
   1896   EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
   1897 };
   1898 
   1899 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
   1900 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   1901   ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   1902 {
   1903   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
   1904   EIGEN_UNUSED_VARIABLE(stride);
   1905   EIGEN_UNUSED_VARIABLE(offset);
   1906   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   1907   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   1908   Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
   1909   Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   1910   Index count = 0;
   1911   const Index peeled_k = (depth/PacketSize)*PacketSize;
   1912 //   if(nr>=8)
   1913 //   {
   1914 //     for(Index j2=0; j2<packet_cols8; j2+=8)
   1915 //     {
   1916 //       // skip what we have before
   1917 //       if(PanelMode) count += 8 * offset;
   1918 //       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
   1919 //       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
   1920 //       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
   1921 //       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
   1922 //       const Scalar* b4 = &rhs[(j2+4)*rhsStride];
   1923 //       const Scalar* b5 = &rhs[(j2+5)*rhsStride];
   1924 //       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
   1925 //       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
   1926 //       Index k=0;
   1927 //       if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
   1928 //       {
   1929 //         for(; k<peeled_k; k+=PacketSize) {
   1930 //           PacketBlock<Packet> kernel;
   1931 //           for (int p = 0; p < PacketSize; ++p) {
   1932 //             kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
   1933 //           }
   1934 //           ptranspose(kernel);
   1935 //           for (int p = 0; p < PacketSize; ++p) {
   1936 //             pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
   1937 //             count+=PacketSize;
   1938 //           }
   1939 //         }
   1940 //       }
   1941 //       for(; k<depth; k++)
   1942 //       {
   1943 //         blockB[count+0] = cj(b0[k]);
   1944 //         blockB[count+1] = cj(b1[k]);
   1945 //         blockB[count+2] = cj(b2[k]);
   1946 //         blockB[count+3] = cj(b3[k]);
   1947 //         blockB[count+4] = cj(b4[k]);
   1948 //         blockB[count+5] = cj(b5[k]);
   1949 //         blockB[count+6] = cj(b6[k]);
   1950 //         blockB[count+7] = cj(b7[k]);
   1951 //         count += 8;
   1952 //       }
   1953 //       // skip what we have after
   1954 //       if(PanelMode) count += 8 * (stride-offset-depth);
   1955 //     }
   1956 //   }
   1957 
   1958   if(nr>=4)
   1959   {
   1960     for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
   1961     {
   1962       // skip what we have before
   1963       if(PanelMode) count += 4 * offset;
   1964       const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
   1965       const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
   1966       const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
   1967       const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
   1968 
   1969       Index k=0;
   1970       if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
   1971       {
   1972         for(; k<peeled_k; k+=PacketSize) {
   1973           PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
   1974           kernel.packet[0] = dm0.loadPacket(k);
   1975           kernel.packet[1%PacketSize] = dm1.loadPacket(k);
   1976           kernel.packet[2%PacketSize] = dm2.loadPacket(k);
   1977           kernel.packet[3%PacketSize] = dm3.loadPacket(k);
   1978           ptranspose(kernel);
   1979           pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
   1980           pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
   1981           pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
   1982           pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
   1983           count+=4*PacketSize;
   1984         }
   1985       }
   1986       for(; k<depth; k++)
   1987       {
   1988         blockB[count+0] = cj(dm0(k));
   1989         blockB[count+1] = cj(dm1(k));
   1990         blockB[count+2] = cj(dm2(k));
   1991         blockB[count+3] = cj(dm3(k));
   1992         count += 4;
   1993       }
   1994       // skip what we have after
   1995       if(PanelMode) count += 4 * (stride-offset-depth);
   1996     }
   1997   }
   1998 
   1999   // copy the remaining columns one at a time (nr==1)
   2000   for(Index j2=packet_cols4; j2<cols; ++j2)
   2001   {
   2002     if(PanelMode) count += offset;
   2003     const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
   2004     for(Index k=0; k<depth; k++)
   2005     {
   2006       blockB[count] = cj(dm0(k));
   2007       count += 1;
   2008     }
   2009     if(PanelMode) count += (stride-offset-depth);
   2010   }
   2011 }
   2012 
   2013 // this version is optimized for row major matrices
   2014 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
   2015 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   2016 {
   2017   typedef typename packet_traits<Scalar>::type Packet;
   2018   typedef typename DataMapper::LinearMapper LinearMapper;
   2019   enum { PacketSize = packet_traits<Scalar>::size };
   2020   EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
   2021 };
   2022 
   2023 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
   2024 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   2025   ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   2026 {
   2027   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
   2028   EIGEN_UNUSED_VARIABLE(stride);
   2029   EIGEN_UNUSED_VARIABLE(offset);
   2030   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   2031   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   2032   Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
   2033   Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   2034   Index count = 0;
   2035 
   2036 //   if(nr>=8)
   2037 //   {
   2038 //     for(Index j2=0; j2<packet_cols8; j2+=8)
   2039 //     {
   2040 //       // skip what we have before
   2041 //       if(PanelMode) count += 8 * offset;
   2042 //       for(Index k=0; k<depth; k++)
   2043 //       {
   2044 //         if (PacketSize==8) {
   2045 //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
   2046 //           pstoreu(blockB+count, cj.pconj(A));
   2047 //         } else if (PacketSize==4) {
   2048 //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
   2049 //           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
   2050 //           pstoreu(blockB+count, cj.pconj(A));
   2051 //           pstoreu(blockB+count+PacketSize, cj.pconj(B));
   2052 //         } else {
   2053 //           const Scalar* b0 = &rhs[k*rhsStride + j2];
   2054 //           blockB[count+0] = cj(b0[0]);
   2055 //           blockB[count+1] = cj(b0[1]);
   2056 //           blockB[count+2] = cj(b0[2]);
   2057 //           blockB[count+3] = cj(b0[3]);
   2058 //           blockB[count+4] = cj(b0[4]);
   2059 //           blockB[count+5] = cj(b0[5]);
   2060 //           blockB[count+6] = cj(b0[6]);
   2061 //           blockB[count+7] = cj(b0[7]);
   2062 //         }
   2063 //         count += 8;
   2064 //       }
   2065 //       // skip what we have after
   2066 //       if(PanelMode) count += 8 * (stride-offset-depth);
   2067 //     }
   2068 //   }
   2069   if(nr>=4)
   2070   {
   2071     for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
   2072     {
   2073       // skip what we have before
   2074       if(PanelMode) count += 4 * offset;
   2075       for(Index k=0; k<depth; k++)
   2076       {
   2077         if (PacketSize==4) {
   2078           Packet A = rhs.loadPacket(k, j2);
   2079           pstoreu(blockB+count, cj.pconj(A));
   2080           count += PacketSize;
   2081         } else {
   2082           const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
   2083           blockB[count+0] = cj(dm0(0));
   2084           blockB[count+1] = cj(dm0(1));
   2085           blockB[count+2] = cj(dm0(2));
   2086           blockB[count+3] = cj(dm0(3));
   2087           count += 4;
   2088         }
   2089       }
   2090       // skip what we have after
   2091       if(PanelMode) count += 4 * (stride-offset-depth);
   2092     }
   2093   }
   2094   // copy the remaining columns one at a time (nr==1)
   2095   for(Index j2=packet_cols4; j2<cols; ++j2)
   2096   {
   2097     if(PanelMode) count += offset;
   2098     for(Index k=0; k<depth; k++)
   2099     {
   2100       blockB[count] = cj(rhs(k, j2));
   2101       count += 1;
   2102     }
   2103     if(PanelMode) count += stride-offset-depth;
   2104   }
   2105 }
   2106 
   2107 } // end namespace internal
   2108 
   2109 /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
   2110   * \sa setCpuCacheSize */
   2111 inline std::ptrdiff_t l1CacheSize()
   2112 {
   2113   std::ptrdiff_t l1, l2, l3;
   2114   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   2115   return l1;
   2116 }
   2117 
   2118 /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
   2119   * \sa setCpuCacheSize */
   2120 inline std::ptrdiff_t l2CacheSize()
   2121 {
   2122   std::ptrdiff_t l1, l2, l3;
   2123   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   2124   return l2;
   2125 }
   2126 
   2127 /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
   2128 rs.
   2129 * \sa setCpuCacheSize */
   2130 inline std::ptrdiff_t l3CacheSize()
   2131 {
   2132   std::ptrdiff_t l1, l2, l3;
   2133   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   2134   return l3;
   2135 }
   2136 
   2137 /** Set the cpu L1 and L2 cache sizes (in bytes).
   2138   * These values are use to adjust the size of the blocks
   2139   * for the algorithms working per blocks.
   2140   *
   2141   * \sa computeProductBlockingSizes */
   2142 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
   2143 {
   2144   internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
   2145 }
   2146 
   2147 } // end namespace Eigen
   2148 
   2149 #endif // EIGEN_GENERAL_BLOCK_PANEL_H
   2150