Home | History | Annotate | Download | only in Core
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2011 Benoit Jacob <jacob.benoit.1 (at) gmail.com>
      5 // Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud (at) inria.fr>
      6 // Copyright (C) 2011-2012 Jitse Niesen <jitse (at) maths.leeds.ac.uk>
      7 //
      8 // This Source Code Form is subject to the terms of the Mozilla
      9 // Public License v. 2.0. If a copy of the MPL was not distributed
     10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     11 
     12 #ifndef EIGEN_ASSIGN_EVALUATOR_H
     13 #define EIGEN_ASSIGN_EVALUATOR_H
     14 
     15 namespace Eigen {
     16 
     17 // This implementation is based on Assign.h
     18 
     19 namespace internal {
     20 
     21 /***************************************************************************
     22 * Part 1 : the logic deciding a strategy for traversal and unrolling       *
     23 ***************************************************************************/
     24 
     25 // copy_using_evaluator_traits is based on assign_traits
        //
        // Compile-time traits describing how the assignment of SrcEvaluator's expression into
        // DstEvaluator's expression through AssignFunc is to be carried out. The results read by
        // the rest of this file are the `Traversal` and `Unrolling` enum values, the `PacketType`
        // typedef and the alignment constants; the private enums are intermediate steps.
     26 
     27 template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
     28 struct copy_using_evaluator_traits
     29 {
     30   typedef typename DstEvaluator::XprType Dst;
     31   typedef typename Dst::Scalar DstScalar;
     32 
          // Flags of both sides, taken from the evaluators (not from the expressions).
     33   enum {
     34     DstFlags = DstEvaluator::Flags,
     35     SrcFlags = SrcEvaluator::Flags
     36   };
     37 
     38 public:
          // Alignment of each side as reported by its evaluator. JointAlignment combines both
          // through EIGEN_PLAIN_ENUM_MIN (defined elsewhere; presumably the smaller of the two).
     39   enum {
     40     DstAlignment = DstEvaluator::Alignment,
     41     SrcAlignment = SrcEvaluator::Alignment,
     42     DstHasDirectAccess = DstFlags & DirectAccessBit,
     43     JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
     44   };
     45 
     46 private:
          // InnerSize / InnerMaxSize: for a compile-time vector, the (max) total size; otherwise
          // the (max) number of columns when DstFlags has RowMajorBit, else the (max) number of rows.
          // NOTE(review): MaxSizeAtCompileTime is initialised from Dst::SizeAtCompileTime, not from
          // Dst::MaxSizeAtCompileTime - confirm this is intentional.
     47   enum {
     48     InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
     49               : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
     50               : int(Dst::RowsAtCompileTime),
     51     InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
     52               : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
     53               : int(Dst::MaxRowsAtCompileTime),
     54     OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
     55     MaxSizeAtCompileTime = Dst::SizeAtCompileTime
     56   };
     57 
          // Two candidate packet types: one chosen from the total size (linear traversal), one
          // chosen from the inner size (inner and slice traversals).
     58   // TODO distinguish between linear traversal and inner-traversals
     59   typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
     60   typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
     61 
     62   enum {
     63     LinearPacketSize = unpacket_traits<LinearPacketType>::size,
     64     InnerPacketSize = unpacket_traits<InnerPacketType>::size
     65   };
     66 
     67 public:
          // Alignment required by each candidate packet type, as given by unpacket_traits.
     68   enum {
     69     LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
     70     InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
     71   };
     72 
     73 private:
          // Feasibility flags for each traversal:
          //  - MightVectorize: both sides share the storage order, both have ActualPacketAccessBit,
          //    and the functor declares PacketAccess.
          //  - MayInnerVectorize: additionally InnerSize and OuterStride are known at compile time
          //    and are multiples of InnerPacketSize, and either EIGEN_UNALIGNED_VECTORIZE is set or
          //    JointAlignment reaches InnerRequiredAlignment.
          //  - MayLinearize: same storage order and both sides have LinearAccessBit.
          //  - MayLinearVectorize: MightVectorize, MayLinearize and direct access to the destination,
          //    plus one of: EIGEN_UNALIGNED_VECTORIZE, sufficient DstAlignment, or a Dynamic
          //    MaxSizeAtCompileTime.
          //  - MaySliceVectorize: MightVectorize, direct access to the destination, and an
          //    InnerMaxSize that is Dynamic or at least InnerPacketSize (with
          //    EIGEN_UNALIGNED_VECTORIZE) / 3*InnerPacketSize (without).
     74   enum {
     75     DstIsRowMajor = DstFlags&RowMajorBit,
     76     SrcIsRowMajor = SrcFlags&RowMajorBit,
     77     StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
     78     MightVectorize = bool(StorageOrdersAgree)
     79                   && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
     80                   && bool(functor_traits<AssignFunc>::PacketAccess),
     81     MayInnerVectorize  = MightVectorize
     82                        && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
     83                        && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
     84                        && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
     85     MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
     86     MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
     87                        && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
     88       /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
     89          so it's only good for large enough sizes. */
     90     MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
     91                        && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
     92       /* slice vectorization can be slow, so we only want it if the slices are big, which is
     93          indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
     94          in a fixed-size matrix
     95          However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
     96   };
     97 
     98 public:
          // Traversal: the first matching rule wins, in this order: linear vectorization when its
          // packet is larger than the inner packet, inner vectorization, linear vectorization,
          // slice vectorization, plain linear traversal, and finally the default traversal.
          // Vectorized is true for any of the three vectorized traversals.
     99   enum {
    100     Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
    101               : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
    102               : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
    103               : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
    104               : int(MayLinearize)        ? int(LinearTraversal)
    105                                          : int(DefaultTraversal),
    106     Vectorized = int(Traversal) == InnerVectorizedTraversal
    107               || int(Traversal) == LinearVectorizedTraversal
    108               || int(Traversal) == SliceVectorizedTraversal
    109   };
    110 
          // Packet type matching the selected traversal: the linear one for
          // LinearVectorizedTraversal, the inner one in every other case.
    111   typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
    112 
    113 private:
          // Unrolling budget: EIGEN_UNROLLING_LIMIT scaled by the packet size actually used
          // (1 when not vectorized). Unrolling is allowed when the relevant size is known at
          // compile time and size * (dst read cost + src read cost) fits in that budget.
    114   enum {
    115     ActualPacketSize    = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
    116                         : Vectorized ? InnerPacketSize
    117                         : 1,
    118     UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
    119     MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
    120                        && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
    121     MayUnrollInner      = int(InnerSize) != Dynamic
    122                        && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
    123   };
    124 
    125 public:
          // Unrolling, per selected traversal:
          //  - inner-vectorized / default: complete, else inner, else none;
          //  - linear-vectorized: complete only if additionally EIGEN_UNALIGNED_VECTORIZE is set
          //    or the destination alignment is sufficient, else none;
          //  - linear: complete or none;
          //  - slice-vectorized: inner or none when EIGEN_UNALIGNED_VECTORIZE is set, otherwise
          //    it falls through to none.
    126   enum {
    127     Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
    128                 ? (
    129                     int(MayUnrollCompletely) ? int(CompleteUnrolling)
    130                   : int(MayUnrollInner)      ? int(InnerUnrolling)
    131                                              : int(NoUnrolling)
    132                   )
    133               : int(Traversal) == int(LinearVectorizedTraversal)
    134                 ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
    135                           ? int(CompleteUnrolling)
    136                           : int(NoUnrolling) )
    137               : int(Traversal) == int(LinearTraversal)
    138                 ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
    139                                               : int(NoUnrolling) )
    140 #if EIGEN_UNALIGNED_VECTORIZE
    141               : int(Traversal) == int(SliceVectorizedTraversal)
    142                 ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
    143                                          : int(NoUnrolling) )
    144 #endif
    145               : int(NoUnrolling)
    146   };
    147 
    148 #ifdef EIGEN_DEBUG_ASSIGN
          // Dumps the expression types, the flags (in hexadecimal) and the intermediate and final
          // decisions to std::cerr. Only compiled when EIGEN_DEBUG_ASSIGN is defined.
    149   static void debug()
    150   {
    151     std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
    152     std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
    153     std::cerr.setf(std::ios::hex, std::ios::basefield);
    154     std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
    155     std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
    156     std::cerr.unsetf(std::ios::hex);
    157     EIGEN_DEBUG_VAR(DstAlignment)
    158     EIGEN_DEBUG_VAR(SrcAlignment)
    159     EIGEN_DEBUG_VAR(LinearRequiredAlignment)
    160     EIGEN_DEBUG_VAR(InnerRequiredAlignment)
    161     EIGEN_DEBUG_VAR(JointAlignment)
    162     EIGEN_DEBUG_VAR(InnerSize)
    163     EIGEN_DEBUG_VAR(InnerMaxSize)
    164     EIGEN_DEBUG_VAR(LinearPacketSize)
    165     EIGEN_DEBUG_VAR(InnerPacketSize)
    166     EIGEN_DEBUG_VAR(ActualPacketSize)
    167     EIGEN_DEBUG_VAR(StorageOrdersAgree)
    168     EIGEN_DEBUG_VAR(MightVectorize)
    169     EIGEN_DEBUG_VAR(MayLinearize)
    170     EIGEN_DEBUG_VAR(MayInnerVectorize)
    171     EIGEN_DEBUG_VAR(MayLinearVectorize)
    172     EIGEN_DEBUG_VAR(MaySliceVectorize)
    173     std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
    174     EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
    175     EIGEN_DEBUG_VAR(UnrollingLimit)
    176     EIGEN_DEBUG_VAR(MayUnrollCompletely)
    177     EIGEN_DEBUG_VAR(MayUnrollInner)
    178     std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
    179     std::cerr << std::endl;
    180   }
    181 #endif
    182 };
    183 
    184 /***************************************************************************
    185 * Part 2 : meta-unrollers
    186 ***************************************************************************/
    187 
    188 /************************
    189 *** Default traversal ***
    190 ************************/
    191 
    192 template<typename Kernel, int Index, int Stop>
    193 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
    194 {
    195   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
    196   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
    197   typedef typename DstEvaluatorType::XprType DstXprType;
    198 
    199   enum {
    200     outer = Index / DstXprType::InnerSizeAtCompileTime,
    201     inner = Index % DstXprType::InnerSizeAtCompileTime
    202   };
    203 
    204   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    205   {
    206     kernel.assignCoeffByOuterInner(outer, inner);
    207     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
    208   }
    209 };
    210 
    211 template<typename Kernel, int Stop>
    212 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
    213 {
    214   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
    215 };
    216 
    217 template<typename Kernel, int Index_, int Stop>
    218 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
    219 {
    220   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
    221   {
    222     kernel.assignCoeffByOuterInner(outer, Index_);
    223     copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
    224   }
    225 };
    226 
    227 template<typename Kernel, int Stop>
    228 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
    229 {
    230   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
    231 };
    232 
    233 /***********************
    234 *** Linear traversal ***
    235 ***********************/
    236 
    237 template<typename Kernel, int Index, int Stop>
    238 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
    239 {
    240   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
    241   {
    242     kernel.assignCoeff(Index);
    243     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
    244   }
    245 };
    246 
    247 template<typename Kernel, int Stop>
    248 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
    249 {
    250   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
    251 };
    252 
    253 /**************************
    254 *** Inner vectorization ***
    255 **************************/
    256 
    257 template<typename Kernel, int Index, int Stop>
    258 struct copy_using_evaluator_innervec_CompleteUnrolling
    259 {
    260   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
    261   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
    262   typedef typename DstEvaluatorType::XprType DstXprType;
    263   typedef typename Kernel::PacketType PacketType;
    264 
    265   enum {
    266     outer = Index / DstXprType::InnerSizeAtCompileTime,
    267     inner = Index % DstXprType::InnerSizeAtCompileTime,
    268     SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
    269     DstAlignment = Kernel::AssignmentTraits::DstAlignment
    270   };
    271 
    272   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    273   {
    274     kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
    275     enum { NextIndex = Index + unpacket_traits<PacketType>::size };
    276     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
    277   }
    278 };
    279 
    280 template<typename Kernel, int Stop>
    281 struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
    282 {
    283   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
    284 };
    285 
    286 template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
    287 struct copy_using_evaluator_innervec_InnerUnrolling
    288 {
    289   typedef typename Kernel::PacketType PacketType;
    290   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
    291   {
    292     kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
    293     enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
    294     copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
    295   }
    296 };
    297 
    298 template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
    299 struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
    300 {
    301   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
    302 };
    303 
    304 /***************************************************************************
    305 * Part 3 : implementation of all cases
    306 ***************************************************************************/
    307 
    // dense_assignment_loop is based on assign_impl.
    //
    // Forward declaration only: each (Traversal, Unrolling) combination is implemented by a
    // partial specialization. Both parameters default to the values computed by the kernel's
    // AssignmentTraits.
    template<class Kernel,
             int Traversal = Kernel::AssignmentTraits::Traversal,
             int Unrolling = Kernel::AssignmentTraits::Unrolling>
    struct dense_assignment_loop;
    314 
    315 /************************
    316 *** Default traversal ***
    317 ************************/
    318 
    319 template<typename Kernel>
    320 struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
    321 {
    322   EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
    323   {
    324     for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
    325       for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
    326         kernel.assignCoeffByOuterInner(outer, inner);
    327       }
    328     }
    329   }
    330 };
    331 
    332 template<typename Kernel>
    333 struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
    334 {
    335   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    336   {
    337     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    338     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
    339   }
    340 };
    341 
    342 template<typename Kernel>
    343 struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
    344 {
    345   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    346   {
    347     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    348 
    349     const Index outerSize = kernel.outerSize();
    350     for(Index outer = 0; outer < outerSize; ++outer)
    351       copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
    352   }
    353 };
    354 
    355 /***************************
    356 *** Linear vectorization ***
    357 ***************************/
    358 
    359 
    360 // The goal of unaligned_dense_assignment_loop is simply to factorize the handling
    361 // of the non vectorizable beginning and ending parts
    362 
    363 template <bool IsAligned = false>
    364 struct unaligned_dense_assignment_loop
    365 {
    366   // if IsAligned = true, then do nothing
    367   template <typename Kernel>
    368   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
    369 };
    370 
    371 template <>
    372 struct unaligned_dense_assignment_loop<false>
    373 {
    374   // MSVC must not inline this functions. If it does, it fails to optimize the
    375   // packet access path.
    376   // FIXME check which version exhibits this issue
    377 #if EIGEN_COMP_MSVC
    378   template <typename Kernel>
    379   static EIGEN_DONT_INLINE void run(Kernel &kernel,
    380                                     Index start,
    381                                     Index end)
    382 #else
    383   template <typename Kernel>
    384   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
    385                                       Index start,
    386                                       Index end)
    387 #endif
    388   {
    389     for (Index index = start; index < end; ++index)
    390       kernel.assignCoeff(index);
    391   }
    392 };
    393 
    394 template<typename Kernel>
    395 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
    396 {
    397   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    398   {
    399     const Index size = kernel.size();
    400     typedef typename Kernel::Scalar Scalar;
    401     typedef typename Kernel::PacketType PacketType;
    402     enum {
    403       requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
    404       packetSize = unpacket_traits<PacketType>::size,
    405       dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
    406       dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
    407                                                             : int(Kernel::AssignmentTraits::DstAlignment),
    408       srcAlignment = Kernel::AssignmentTraits::JointAlignment
    409     };
    410     const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
    411     const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
    412 
    413     unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
    414 
    415     for(Index index = alignedStart; index < alignedEnd; index += packetSize)
    416       kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
    417 
    418     unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
    419   }
    420 };
    421 
    422 template<typename Kernel>
    423 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
    424 {
    425   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    426   {
    427     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    428     typedef typename Kernel::PacketType PacketType;
    429 
    430     enum { size = DstXprType::SizeAtCompileTime,
    431            packetSize =unpacket_traits<PacketType>::size,
    432            alignedSize = (size/packetSize)*packetSize };
    433 
    434     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
    435     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
    436   }
    437 };
    438 
    439 /**************************
    440 *** Inner vectorization ***
    441 **************************/
    442 
    443 template<typename Kernel>
    444 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
    445 {
    446   typedef typename Kernel::PacketType PacketType;
    447   enum {
    448     SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
    449     DstAlignment = Kernel::AssignmentTraits::DstAlignment
    450   };
    451   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    452   {
    453     const Index innerSize = kernel.innerSize();
    454     const Index outerSize = kernel.outerSize();
    455     const Index packetSize = unpacket_traits<PacketType>::size;
    456     for(Index outer = 0; outer < outerSize; ++outer)
    457       for(Index inner = 0; inner < innerSize; inner+=packetSize)
    458         kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
    459   }
    460 };
    461 
    462 template<typename Kernel>
    463 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
    464 {
    465   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    466   {
    467     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    468     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
    469   }
    470 };
    471 
    472 template<typename Kernel>
    473 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
    474 {
    475   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    476   {
    477     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    478     typedef typename Kernel::AssignmentTraits Traits;
    479     const Index outerSize = kernel.outerSize();
    480     for(Index outer = 0; outer < outerSize; ++outer)
    481       copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
    482                                                    Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
    483   }
    484 };
    485 
    486 /***********************
    487 *** Linear traversal ***
    488 ***********************/
    489 
    490 template<typename Kernel>
    491 struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
    492 {
    493   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    494   {
    495     const Index size = kernel.size();
    496     for(Index i = 0; i < size; ++i)
    497       kernel.assignCoeff(i);
    498   }
    499 };
    500 
    501 template<typename Kernel>
    502 struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
    503 {
    504   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    505   {
    506     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    507     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
    508   }
    509 };
    510 
    511 /**************************
    512 *** Slice vectorization ***
    513 ***************************/
    514 
    515 template<typename Kernel>
    516 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
    517 {
    518   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
    519   {
    520     typedef typename Kernel::Scalar Scalar;
    521     typedef typename Kernel::PacketType PacketType;
    522     enum {
    523       packetSize = unpacket_traits<PacketType>::size,
    524       requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
    525       alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
    526       dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
    527       dstAlignment = alignable ? int(requestedAlignment)
    528                                : int(Kernel::AssignmentTraits::DstAlignment)
    529     };
    530     const Scalar *dst_ptr = kernel.dstDataPtr();
    531     if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
    532     {
    533       // the pointer is not aligend-on scalar, so alignment is not possible
    534       return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
    535     }
    536     const Index packetAlignedMask = packetSize - 1;
    537     const Index innerSize = kernel.innerSize();
    538     const Index outerSize = kernel.outerSize();
    539     const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
    540     Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
    541 
    542     for(Index outer = 0; outer < outerSize; ++outer)
    543     {
    544       const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
    545       // do the non-vectorizable part of the assignment
    546       for(Index inner = 0; inner<alignedStart ; ++inner)
    547         kernel.assignCoeffByOuterInner(outer, inner);
    548 
    549       // do the vectorizable part of the assignment
    550       for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
    551         kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
    552 
    553       // do the non-vectorizable part of the assignment
    554       for(Index inner = alignedEnd; inner<innerSize ; ++inner)
    555         kernel.assignCoeffByOuterInner(outer, inner);
    556 
    557       alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
    558     }
    559   }
    560 };
    561 
    #if EIGEN_UNALIGNED_VECTORIZE
    // Slice vectorized traversal with the inner loop unrolled (only compiled when
    // EIGEN_UNALIGNED_VECTORIZE is enabled). For every outer index, the largest multiple of the
    // packet size that fits in the compile-time inner size is assigned packet-wise and the
    // remainder coefficient-wise. Both alignment arguments of the packet unroller are 0
    // (presumably the unaligned load/store mode - confirm against the alignment enum).
    template<typename Kernel>
    struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
    {
      EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
      {
        typedef typename Kernel::PacketType PacketType;
        typedef typename Kernel::DstEvaluatorType::XprType DstXprType;

        enum {
          InnerLen      = DstXprType::InnerSizeAtCompileTime,
          PacketLen     = unpacket_traits<PacketType>::size,
          VectorizedLen = (InnerLen/PacketLen)*PacketLen
        };

        typedef copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizedLen, 0, 0> PacketPart;
        typedef copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizedLen, InnerLen> ScalarPart;

        for(Index o = 0; o < kernel.outerSize(); ++o)
        {
          PacketPart::run(kernel, o);
          ScalarPart::run(kernel, o);
        }
      }
    };
    #endif
    583 
    584 
    585 /***************************************************************************
    586 * Part 4 : Generic dense assignment kernel
    587 ***************************************************************************/
    588 
    589 // This class generalize the assignment of a coefficient (or packet) from one dense evaluator
    590 // to another dense writable evaluator.
    591 // It is parametrized by the two evaluators, and the actual assignment functor.
    592 // This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
    593 // One can customize the assignment using this generic dense_assignment_kernel with different
    594 // functors, or by completely overloading it, by-passing a functor.
    595 template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
    596 class generic_dense_assignment_kernel
    597 {
    598 protected:
    599   typedef typename DstEvaluatorTypeT::XprType DstXprType;
    600   typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
    601 public:
    602 
    603   typedef DstEvaluatorTypeT DstEvaluatorType;
    604   typedef SrcEvaluatorTypeT SrcEvaluatorType;
    605   typedef typename DstEvaluatorType::Scalar Scalar;
    606   typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
    607   typedef typename AssignmentTraits::PacketType PacketType;
    608 
    609 
    610   EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    611     : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
    612   {
    613     #ifdef EIGEN_DEBUG_ASSIGN
    614     AssignmentTraits::debug();
    615     #endif
    616   }
    617 
    618   EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
    619   EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
    620   EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
    621   EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
    622   EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
    623   EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
    624 
    625   EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
    626   EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
    627 
    628   /// Assign src(row,col) to dst(row,col) through the assignment functor.
    629   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
    630   {
    631     m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
    632   }
    633 
    634   /// \sa assignCoeff(Index,Index)
    635   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
    636   {
    637     m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
    638   }
    639 
    640   /// \sa assignCoeff(Index,Index)
    641   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
    642   {
    643     Index row = rowIndexByOuterInner(outer, inner);
    644     Index col = colIndexByOuterInner(outer, inner);
    645     assignCoeff(row, col);
    646   }
    647 
    648 
    649   template<int StoreMode, int LoadMode, typename PacketType>
    650   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
    651   {
    652     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
    653   }
    654 
    655   template<int StoreMode, int LoadMode, typename PacketType>
    656   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
    657   {
            // Linear-index packet assignment: a PacketType is read from the source evaluator with
            // LoadMode at 'index' and passed, together with the address of the destination
            // coefficient at 'index', to the functor's assignPacket<StoreMode>().
    658     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
    659   }
    660 
    661   template<int StoreMode, int LoadMode, typename PacketType>
    662   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
    663   {
            // Outer/inner packet assignment: converts (outer, inner) to (row, col) and forwards
            // to assignPacket(row, col) with the same StoreMode / LoadMode / PacketType.
    664     Index row = rowIndexByOuterInner(outer, inner);
    665     Index col = colIndexByOuterInner(outer, inner);
    666     assignPacket<StoreMode,LoadMode,PacketType>(row, col);
    667   }
    668 
    669   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
    670   {
            // Row index corresponding to (outer, inner) for the destination. Every condition
            // tests a compile-time constant of the destination:
            //  - one row at compile time              -> the row is always 0
            //  - one column at compile time           -> 'inner' is the row
            //  - otherwise, RowMajorBit set in Flags  -> 'outer' is the row
            //  - otherwise                            -> 'inner' is the row
    671     typedef typename DstEvaluatorType::ExpressionTraits Traits;
    672     return int(Traits::RowsAtCompileTime) == 1 ? 0
    673       : int(Traits::ColsAtCompileTime) == 1 ? inner
    674       : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
    675       : inner;
    676   }
    677 
    678   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
    679   {
            // Column index corresponding to (outer, inner) for the destination. Every condition
            // tests a compile-time constant of the destination:
            //  - one column at compile time           -> the column is always 0
            //  - one row at compile time              -> 'inner' is the column
            //  - otherwise, RowMajorBit set in Flags  -> 'inner' is the column
            //  - otherwise                            -> 'outer' is the column
    680     typedef typename DstEvaluatorType::ExpressionTraits Traits;
    681     return int(Traits::ColsAtCompileTime) == 1 ? 0
    682       : int(Traits::RowsAtCompileTime) == 1 ? inner
    683       : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
    684       : outer;
    685   }
    686 
    687   EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
    688   {
            // Returns data() of the original destination expression (m_dstExpr), not of the
            // destination evaluator.
    689     return m_dstExpr.data();
    690   }
    691 
    692 protected:
    693   DstEvaluatorType& m_dst;        // destination evaluator, held by reference
    694   const SrcEvaluatorType& m_src;  // source evaluator, held by const reference
    695   const Functor &m_functor;       // assignment functor, held by const reference
    696   // TODO find a way to avoid the need for the original expression
    697   DstXprType& m_dstExpr;          // original destination expression; used above for sizes, outerStride() and data()
    698 };
    699 
    700 /***************************************************************************
    701 * Part 5 : Entry point for dense rectangular assignment
    702 ***************************************************************************/
    703 
    704 template<typename DstXprType,typename SrcXprType, typename Functor>
    705 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    706 void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
    707 {
          // Overload for an arbitrary functor: it never resizes the destination, it only
          // asserts that dst and src already have the same number of rows and columns.
          // NOTE(review): EIGEN_ONLY_USED_FOR_DEBUG presumably silences unused-parameter
          // warnings when eigen_assert is compiled out -- confirm against its definition.
    708   EIGEN_ONLY_USED_FOR_DEBUG(dst);
    709   EIGEN_ONLY_USED_FOR_DEBUG(src);
    710   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    711 }
    712 
    713 template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
    714 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    715 void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
    716 {
          // Overload for plain assignment (assign_op): if dst's dimensions differ from src's,
          // dst.resize() is called with src's dimensions; the final assert checks that dst
          // really ended up with those dimensions.
    717   Index dstRows = src.rows();
    718   Index dstCols = src.cols();
    719   if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
    720     dst.resize(dstRows, dstCols);
    721   eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
    722 }
    723 
    724 template<typename DstXprType, typename SrcXprType, typename Functor>
    725 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
    726 {
          // Entry point of the dense assignment: builds the two evaluators and the kernel tying
          // them to 'func', then runs dense_assignment_loop on that kernel.
          // The statement order below is significant (see the NOTE).
    727   typedef evaluator<DstXprType> DstEvaluatorType;
    728   typedef evaluator<SrcXprType> SrcEvaluatorType;
    729 
    730   SrcEvaluatorType srcEvaluator(src);
    731 
    732   // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
    733   // we need to resize the destination after the source evaluator has been created.
    734   resize_if_allowed(dst, src, func);
    735 
          // The destination evaluator is created only after the possible resize above.
    736   DstEvaluatorType dstEvaluator(dst);
    737 
          // The kernel also receives the destination expression itself
          // (via dst.const_cast_derived()), in addition to its evaluator.
    738   typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
    739   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
    740 
    741   dense_assignment_loop<Kernel>::run(kernel);
    742 }
    743 
    744 template<typename DstXprType, typename SrcXprType>
    745 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
    746 {
          // Convenience overload: uses a default-constructed
          // assign_op<DstXprType::Scalar, SrcXprType::Scalar> as the functor.
    747   call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
    748 }
    749 
    750 /***************************************************************************
    751 * Part 6 : Generic assignment
    752 ***************************************************************************/
    753 
    754 // Based on the respective shapes of the destination and source,
    755 // the class AssignmentKind determines the kind of assignment mechanism.
    756 // AssignmentKind must define a Kind typedef.
    757 template<typename DstShape, typename SrcShape> struct AssignmentKind;
    758 
    759 // Assignment kinds defined in this file:
    760 struct Dense2Dense {};          // tag selected for the DenseShape/DenseShape pair (see specialization below)
    761 struct EigenBase2EigenBase {};  // tag selected for every other shape pair (see primary template below)
    762 
    763 template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };  // primary template: fallback kind
    764 template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };    // both shapes dense
    765 
    766 // This is the main assignment class
        // Only declared here; the implementations are partial specializations selected on Kind.
        //  - Kind defaults to AssignmentKind<DstShape,SrcShape>::Kind, where the shapes are taken
        //    from evaluator_traits<DstXprType>::Shape and evaluator_traits<SrcXprType>::Shape.
        //  - EnableIf defaults to void; it is an extra slot that specializations can use for SFINAE.
    767 template< typename DstXprType, typename SrcXprType, typename Functor,
    768           typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
    769           typename EnableIf = void>
    770 struct Assignment;
    771 
    772 
    773 // The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
    774 // Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes things quite complicated.
    775 // So this intermediate function removes everything related to "assume-aliasing" such that Assignment
    776 // does not have to bother about these annoying details.
    777 
    778 template<typename Dst, typename Src>
    779 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    780 void call_assignment(Dst& dst, const Src& src)
    781 {
          // Two-argument form: forwards to the three-argument call_assignment with a
          // default-constructed assign_op<Dst::Scalar, Src::Scalar> as the functor.
    782   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
    783 }
    784 template<typename Dst, typename Src>
    785 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    786 void call_assignment(const Dst& dst, const Src& src)
    787 {
          // Same as the overload taking 'Dst&', but accepts the destination by const reference;
          // the functor is again a default-constructed assign_op<Dst::Scalar, Src::Scalar>.
          // NOTE(review): presumably exists so that temporary/proxy destinations can bind here -- confirm with callers.
    788   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
    789 }
    790 
    791 // Deal with "assume-aliasing"
        // This overload is enabled only when evaluator_assume_aliasing<Src>::value is true (the
        // trailing enable_if parameter removes it from overload resolution otherwise).
        // The source is first copied into a temporary of type plain_matrix_type<Src>::type, and
        // that temporary is then assigned to dst with call_assignment_no_alias.
    792 template<typename Dst, typename Src, typename Func>
    793 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    794 void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
    795 {
    796   typename plain_matrix_type<Src>::type tmp(src);
    797   call_assignment_no_alias(dst, tmp, func);
    798 }
    799 
    800 template<typename Dst, typename Src, typename Func>
    801 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    802 void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
    803 {
          // Enabled only when evaluator_assume_aliasing<Src>::value is false: no temporary is
          // created, src is passed straight to call_assignment_no_alias.
    804   call_assignment_no_alias(dst, src, func);
    805 }
    806 
    807 // by-pass "assume-aliasing"
    808 // When there is no aliasing, we require that 'dst' has been properly resized
        // Overload for a NoAlias-wrapped destination: the wrapped expression is obtained with
        // dst.expression() and assigned with call_assignment_no_alias, without testing
        // evaluator_assume_aliasing<Src> and without creating a temporary.
    809 template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
    810 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    811 void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
    812 {
    813   call_assignment_no_alias(dst.expression(), src, func);
    814 }
    815 
    816 
    817 template<typename Dst, typename Src, typename Func>
    818 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    819 void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
    820 {
          // Assigns src to dst through Assignment<...>::run, transposing the destination first
          // when one side is a compile-time row vector and the other a compile-time column vector.
          //
          // NeedToTranspose is true when (dst has 1 row and src has 1 column) or (dst has 1 column
          // and src has 1 row) at compile time, except when dst's compile-time size is 1.
    821   enum {
    822     NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
    823                         || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
    824                       ) && int(Dst::SizeAtCompileTime) != 1
    825   };
    826 
          // ActualDstTypeCleaned is the non-reference type used for the static checks and to pick
          // the Assignment specialization. ActualDstType is what actualDst is declared as: a
          // Transpose<Dst> object built from dst when transposing, otherwise a plain Dst& to dst.
    827   typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
    828   typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
    829   ActualDstType actualDst(dst);
    830 
    831   // TODO check whether this is the right place to perform these checks:
    832   EIGEN_STATIC_ASSERT_LVALUE(Dst)
    833   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
    834   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
    835 
    836   Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
    837 }
    838 template<typename Dst, typename Src>
    839 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    840 void call_assignment_no_alias(Dst& dst, const Src& src)
    841 {
          // Two-argument form: forwards to the three-argument call_assignment_no_alias with a
          // default-constructed assign_op<Dst::Scalar, Src::Scalar> as the functor.
    842   call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
    843 }
    844 
    845 template<typename Dst, typename Src, typename Func>
    846 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    847 void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
    848 {
          // Like call_assignment_no_alias, but without the NeedToTranspose logic: the static
          // checks and the Assignment specialization use Dst itself, and dst is passed to
          // Assignment<Dst,Src,Func>::run unchanged.
    849   // TODO check whether this is the right place to perform these checks:
    850   EIGEN_STATIC_ASSERT_LVALUE(Dst)
    851   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
    852   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
    853 
    854   Assignment<Dst,Src,Func>::run(dst, src, func);
    855 }
    856 template<typename Dst, typename Src>
    857 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    858 void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
    859 {
          // Two-argument form: forwards to the three-argument overload with a
          // default-constructed assign_op<Dst::Scalar, Src::Scalar> as the functor.
    860   call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
    861 }
    862 
    863 // forward declaration
    864 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
    865 
    866 // Generic Dense to Dense assignment
    867 // Note that the last template argument "Weak" is needed to make it possible to perform
    868 // both partial specialization+SFINAE without ambiguous specialization
    869 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
    870 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
    871 {
          // Performs the assignment by delegating to call_dense_assignment_loop(dst, src, func).
          // Unless EIGEN_NO_DEBUG is defined, check_for_aliasing(dst, src) is called first.
    872   EIGEN_DEVICE_FUNC
    873   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
    874   {
    875 #ifndef EIGEN_NO_DEBUG
    876     internal::check_for_aliasing(dst, src);
    877 #endif
    878 
    879     call_dense_assignment_loop(dst, src, func);
    880   }
    881 };
    882 
    883 // Generic assignment through evalTo.
    884 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
    885 // Note that the last template argument "Weak" is needed to make it possible to perform
    886 // both partial specialization+SFINAE without ambiguous specialization
    887 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
    888 struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
    889 {
          // Plain assignment (assign_op): resizes dst to src's dimensions if they differ, asserts
          // that the dimensions now match, then lets the source write itself with src.evalTo(dst).
    890   EIGEN_DEVICE_FUNC
    891   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
    892   {
    893     Index dstRows = src.rows();
    894     Index dstCols = src.cols();
    895     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
    896       dst.resize(dstRows, dstCols);
    897 
    898     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    899     src.evalTo(dst);
    900   }
    901 
    902   // NOTE The following two functions are templated to avoid their instantiation if not needed
    903   //      This is needed because some expressions support evalTo only and/or have 'void' as scalar type.
          // Additive assignment (add_assign_op): same resize-and-assert steps as above, then src.addTo(dst).
          // NOTE(review): dst is resized before accumulating when the dimensions differ -- confirm this is intended for '+='.
    904   template<typename SrcScalarType>
    905   EIGEN_DEVICE_FUNC
    906   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
    907   {
    908     Index dstRows = src.rows();
    909     Index dstCols = src.cols();
    910     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
    911       dst.resize(dstRows, dstCols);
    912 
    913     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    914     src.addTo(dst);
    915   }
    916 
          // Subtractive assignment (sub_assign_op): same resize-and-assert steps as above, then src.subTo(dst).
          // NOTE(review): dst is resized before accumulating when the dimensions differ -- confirm this is intended for '-='.
    917   template<typename SrcScalarType>
    918   EIGEN_DEVICE_FUNC
    919   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
    920   {
    921     Index dstRows = src.rows();
    922     Index dstCols = src.cols();
    923     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
    924       dst.resize(dstRows, dstCols);
    925 
    926     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    927     src.subTo(dst);
    928   }
    929 };
    930 
    931 } // namespace internal
    932 
    933 } // end namespace Eigen
    934 
    935 #endif // EIGEN_ASSIGN_EVALUATOR_H
    936