// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
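//
// For reference, here is a minimal scalar sketch (not part of the optimized
// kernel) of the operation this specialization performs; conjugation and the
// alpha conversions described above are omitted, and the parameter names match
// those of run() below:
//
//   for (Index i=0; i<cols; ++i)                  // one column of the lhs
//     for (Index j=0; j<rows; ++j)                // one coefficient of res
//       res[j] += alpha * rhs[i*rhsIncr] * lhs[j + i*lhsStride];
//
// The optimized code below processes 4 such columns per iteration and uses
// packet loads/stores wherever the alignment pattern allows it.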
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index
  #ifdef EIGEN_INTERNAL_DEBUGGING
    resIncr
  #endif
  , RhsScalar alpha)
{
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
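  // Note on the macro arguments: A0, A13 and A2 are token suffixes appended to
  // "ploa" by EIGEN_CAT, so passing "d" selects the aligned load pload<> while
  // passing "du" selects the unaligned load ploadu<> for the corresponding lhs band.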

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
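  // Worked example (illustrative only, assuming float data with 4-wide packets,
  // i.e. ResPacketSize==4 and peels==2): if res starts 2 scalars past a packet
  // boundary and size==11, then alignedStart==2, alignedSize==2+((11-2)&~3)==10
  // and peeledSize==2+((10-2)&~7)==10, leaving one scalar coefficient for the
  // tail loop below.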

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                       : FirstAligned;
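  // For instance, with a hypothetical LhsPacketSize of 4 (e.g. float/SSE):
  // lhsStride%4==0 gives AllAligned, lhsStride%4==2 gives EvenAligned (every
  // other column shares the same alignment), and an odd remainder gives
  // FirstAligned.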

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps the compiler generate better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

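              // Peeled loop: the three unaligned lhs bands are reconstructed
              // from pairs of aligned loads combined with palign<1>/<2>/<3>
              // (rather than plain unaligned loads), and two accumulators
              // (T0, T1) are kept in flight per iteration to hide load and
              // multiply latency.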
              for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
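//
// For reference, here is a minimal scalar sketch (not part of the optimized
// kernel) of the operation this specialization performs; conjugation is
// omitted and the parameter names match those of run() below:
//
//   for (Index i=0; i<rows; ++i) {                // one row of the lhs
//     ResScalar tmp(0);
//     for (Index j=0; j<cols; ++j)                // dot product of row i with rhs
//       tmp += lhs[j + i*lhsStride] * rhs[j];
//     res[i*resIncr] += alpha * tmp;
//   }
//
// The optimized code below accumulates 4 such dot products per iteration.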
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
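  // As in the column-major kernel, the A0/A13/A2 suffixes select pload ("d") or
  // ploadu ("du") for each lhs band; here the partial products are accumulated
  // into the ptmp0..ptmp3 packet registers instead of being stored back to res.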

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the rhs do we have to skip to be aligned.
  // Here we assume the data is at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                      || LhsPacketSize==1
                      || (skipRows + rowsAtOnce >= rows)
                      || LhsPacketSize > depth
                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide the
               * overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more
               * efficient than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]            += alpha*tmp0;
    res[(i+offset1)*resIncr]  += alpha*tmp1;
    res[(i+2)*resIncr]        += alpha*tmp2;
    res[(i+offset3)*resIncr]  += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned rhs coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H