// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows us to both reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependencies. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * Accesses to the matrix coefficients follow this logic:
 *
 * - if all columns have the same alignment then
 *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
 *   - otherwise perform unaligned loads only (-> NoneAligned case)
 * - otherwise
 *   - if even columns have the same alignment then
 *     // odd columns are guaranteed to have the same alignment too
 *     - if even or odd columns have the same alignment as the result, then
 *       // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
 *       - perform half aligned and half unaligned loads (-> EvenAligned case)
 *     - otherwise perform unaligned loads only (-> NoneAligned case)
 *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
 *     - one out of 4 consecutive columns is guaranteed to be aligned with the result vector,
 *       perform simple aligned loads for this column and aligned loads plus re-alignment for the others. (-> FirstAligned case)
 *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
 *   - otherwise,
 *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
 *     // we currently fall back to the NoneAligned case
 *
 * The same reasoning applies to the transposed case.
 *
 * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
 * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
 * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
 * compared to unaligned loads on a 4 byte boundary.
 *
 */
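// Illustration (a sketch; the exact dispatch path is an assumption): a plain dense
// matrix * vector product with a column-major lhs, such as
//
//   Eigen::MatrixXf A(m,n);          // column-major by default
//   Eigen::VectorXf x(n), y(m);
//   y.noalias() += 0.5f * A * x;
//
// is evaluated by the ColMajor specialization below, with res pointing to y's data
// and alpha==0.5f.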
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  RhsScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
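
  // The helper above accumulates one result packet:
  //   res[j..j+ResPacketSize) += lhs0[j..]*ptmp0 + lhs1[j..]*ptmp1 + lhs2[j..]*ptmp2 + lhs3[j..]*ptmp3,
  // each lhs column being loaded with the alignment hint passed as macro argument.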

  typedef typename LhsMapper::VectorMapper LhsScalars;

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
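  // Note: peeledSize is chosen so that each iteration of the peeled loops below, which
  // processes `peels` packets and pre-loads one extra lhs packet ahead for the palign-based
  // re-alignment, stays strictly within the aligned range.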

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                       : FirstAligned;
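  // For instance (illustrative values): with SSE floats (LhsPacketSize==4) and lhsStride==7,
  // alignmentStep = (4 - 7%4) & 3 = 1, so the columns' alignment phases cycle through all
  // four offsets -> FirstAligned; lhsStride==6 gives a step of 2 == LhsPacketSize/2
  // -> EvenAligned; a stride that is a multiple of 4 gives a step of 0 -> AllAligned.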

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    // Currently, it seems to be better to perform unaligned loads anyway
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
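  // Presumably offset1/offset3 permute columns i+1 and i+3: the FirstAligned peeled loop
  // below expects lhs1 to be misaligned by 1 scalar (handled with palign<1>) and lhs3 by
  // 3 scalars (palign<3>); when alignmentStep==1 the misalignments of columns i+1 and i+3
  // are 3 and 1 respectively, hence the swap.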

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));

    // this helps a lot in generating better binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
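              // Process 4 columns with two peeled iterations to hide the overhead of the
              // re-aligned loads: the misaligned lhs packets are rebuilt with palign from
              // the two aligned packets overlapping them, which is much more efficient than
              // plain unaligned loads (see the analogous comment in the row-major kernel below).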
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                A00 = lhs0.template load<LhsPacket, Aligned>(j);
                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
        // process aligned result's coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which allows us to both reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependencies. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex number (or converted to one)
 *  - no vectorization
 */
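// Illustration (a sketch; the exact dispatch path is an assumption): products traversing
// the lhs along its rows, e.g.
//
//   Eigen::MatrixXf A(m,n);
//   Eigen::VectorXf x(m), y(n);
//   y.noalias() += 0.5f * A.transpose() * x;
//
// are evaluated by the RowMajor specialization below, each coefficient of the result
// being a dot product between one lhs row and the rhs.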
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  eigen_internal_assert(rhs.stride()==1);

  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
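
  // The helper above accumulates one rhs packet into the four running dot products:
  //   ptmp_k += lhs_k[j..j+LhsPacketSize) * rhs[j..j+RhsPacketSize),  k = 0..3,
  // each lhs row being loaded with the alignment hint passed as macro argument.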

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  typedef typename LhsMapper::VectorMapper LhsScalars;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index depth = cols;
  const Index lhsStride = lhs.stride();

  // How many coeffs of the rhs do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = rhs.firstAligned(depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                           : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
  const Index rhsAlignmentOffset = rhs.firstAligned(rows);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
                      || LhsPacketSize==1
                      || (skipRows + rowsAtOnce >= rows)
                      || LhsPacketSize > depth
                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
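  // As in the column-major kernel, offset1/offset3 presumably permute rows i+1 and i+3 so
  // that the palign<1>/palign<3> based loads of the FirstAligned path match each row's
  // actual misalignment when alignmentStep==1.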

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    // FIXME: what is the purpose of this EIGEN_ALIGN_MAX ??
    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs(j, 0);
        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
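              // For instance, with 4-float packets (SSE), palign<1>(A01,A11) overwrites A01
              // with { A01[1], A01[2], A01[3], A11[0] }, i.e. the packet starting one scalar
              // past A01's aligned address -- exactly the lhs1 data needed at index j.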
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);

                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs(j, 0);
      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
    }
    res[i*resIncr]            += alpha*tmp0;
    res[(i+offset1)*resIncr]  += alpha*tmp1;
    res[(i+2)*resIncr]        += alpha*tmp2;
    res[(i+offset3)*resIncr]  += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
      // process the rhs' first unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H