Home | History | Annotate | Download | only in util
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud (at) inria.fr>
      5 // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1 (at) gmail.com>
      6 // Copyright (C) 2009 Kenneth Riddile <kfriddile (at) yahoo.com>
      7 // Copyright (C) 2010 Hauke Heibel <hauke.heibel (at) gmail.com>
      8 // Copyright (C) 2010 Thomas Capricelli <orzel (at) freehackers.org>
      9 //
     10 // This Source Code Form is subject to the terms of the Mozilla
     11 // Public License v. 2.0. If a copy of the MPL was not distributed
     12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     13 
     14 
     15 /*****************************************************************************
     16 *** Platform checks for aligned malloc functions                           ***
     17 *****************************************************************************/
     18 
     19 #ifndef EIGEN_MEMORY_H
     20 #define EIGEN_MEMORY_H
     21 
     22 // On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see:
     23 //   http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html
     24 // This is true at least since glibc 2.8.
     25 // This leaves the question how to detect 64-bit. According to this document,
     26 //   http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
     27 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
     28 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
     29 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
     30  && defined(__LP64__)
     31   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
     32 #else
     33   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
     34 #endif
     35 
     36 // FreeBSD 6 seems to have 16-byte aligned malloc
     37 //   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
     38 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
     39 //   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
     40 #if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
     41   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
     42 #else
     43   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
     44 #endif
     45 
     46 #if defined(__APPLE__) \
     47  || defined(_WIN64) \
     48  || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
     49  || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
     50   #define EIGEN_MALLOC_ALREADY_ALIGNED 1
     51 #else
     52   #define EIGEN_MALLOC_ALREADY_ALIGNED 0
     53 #endif
     54 
     55 #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) \
     56  && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
     57   #define EIGEN_HAS_POSIX_MEMALIGN 1
     58 #else
     59   #define EIGEN_HAS_POSIX_MEMALIGN 0
     60 #endif
     61 
     62 #ifdef EIGEN_VECTORIZE_SSE
     63   #define EIGEN_HAS_MM_MALLOC 1
     64 #else
     65   #define EIGEN_HAS_MM_MALLOC 0
     66 #endif
     67 
     68 namespace Eigen {
     69 
     70 namespace internal {
     71 
     72 inline void throw_std_bad_alloc()
     73 {
     74   #ifdef EIGEN_EXCEPTIONS
     75     throw std::bad_alloc();
     76   #else
     77     std::size_t huge = -1;
     78     new int[huge];
     79   #endif
     80 }
     81 
     82 /*****************************************************************************
     83 *** Implementation of handmade aligned functions                           ***
     84 *****************************************************************************/
     85 
     86 /* ----- Hand made implementations of aligned malloc/free and realloc ----- */
     87 
     88 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
     89   * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
     90   */
     91 inline void* handmade_aligned_malloc(size_t size)
     92 {
     93   void *original = std::malloc(size+16);
     94   if (original == 0) return 0;
     95   void *aligned = reinterpret_cast<void*>((reinterpret_cast<size_t>(original) & ~(size_t(15))) + 16);
     96   *(reinterpret_cast<void**>(aligned) - 1) = original;
     97   return aligned;
     98 }
     99 
    100 /** \internal Frees memory allocated with handmade_aligned_malloc */
    101 inline void handmade_aligned_free(void *ptr)
    102 {
    103   if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
    104 }
    105 
    106 /** \internal
    107   * \brief Reallocates aligned memory.
    108   * Since we know that our handmade version is based on std::realloc
    109   * we can use std::realloc to implement efficient reallocation.
    110   */
    111 inline void* handmade_aligned_realloc(void* ptr, size_t size, size_t = 0)
    112 {
    113   if (ptr == 0) return handmade_aligned_malloc(size);
    114   void *original = *(reinterpret_cast<void**>(ptr) - 1);
    115   original = std::realloc(original,size+16);
    116   if (original == 0) return 0;
    117   void *aligned = reinterpret_cast<void*>((reinterpret_cast<size_t>(original) & ~(size_t(15))) + 16);
    118   *(reinterpret_cast<void**>(aligned) - 1) = original;
    119   return aligned;
    120 }
    121 
    122 /*****************************************************************************
    123 *** Implementation of generic aligned realloc (when no realloc can be used)***
    124 *****************************************************************************/
    125 
    126 void* aligned_malloc(size_t size);
    127 void  aligned_free(void *ptr);
    128 
    129 /** \internal
    130   * \brief Reallocates aligned memory.
    131   * Allows reallocation with aligned ptr types. This implementation will
    132   * always create a new memory chunk and copy the old data.
    133   */
    134 inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
    135 {
    136   if (ptr==0)
    137     return aligned_malloc(size);
    138 
    139   if (size==0)
    140   {
    141     aligned_free(ptr);
    142     return 0;
    143   }
    144 
    145   void* newptr = aligned_malloc(size);
    146   if (newptr == 0)
    147   {
    148     #ifdef EIGEN_HAS_ERRNO
    149     errno = ENOMEM; // according to the standard
    150     #endif
    151     return 0;
    152   }
    153 
    154   if (ptr != 0)
    155   {
    156     std::memcpy(newptr, ptr, (std::min)(size,old_size));
    157     aligned_free(ptr);
    158   }
    159 
    160   return newptr;
    161 }
    162 
    163 /*****************************************************************************
    164 *** Implementation of portable aligned versions of malloc/free/realloc     ***
    165 *****************************************************************************/
    166 
    167 #ifdef EIGEN_NO_MALLOC
    168 inline void check_that_malloc_is_allowed()
    169 {
    170   eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
    171 }
    172 #elif defined EIGEN_RUNTIME_NO_MALLOC
    173 inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
    174 {
    175   static bool value = true;
    176   if (update == 1)
    177     value = new_value;
    178   return value;
    179 }
    180 inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
    181 inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
    182 inline void check_that_malloc_is_allowed()
    183 {
    184   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
    185 }
    186 #else
    187 inline void check_that_malloc_is_allowed()
    188 {}
    189 #endif
    190 
    191 /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
    192   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
    193   */
    194 inline void* aligned_malloc(size_t size)
    195 {
    196   check_that_malloc_is_allowed();
    197 
    198   void *result;
    199   #if !EIGEN_ALIGN
    200     result = std::malloc(size);
    201   #elif EIGEN_MALLOC_ALREADY_ALIGNED
    202     result = std::malloc(size);
    203   #elif EIGEN_HAS_POSIX_MEMALIGN
    204     if(posix_memalign(&result, 16, size)) result = 0;
    205   #elif EIGEN_HAS_MM_MALLOC
    206     result = _mm_malloc(size, 16);
    207   #elif (defined _MSC_VER)
    208     result = _aligned_malloc(size, 16);
    209   #else
    210     result = handmade_aligned_malloc(size);
    211   #endif
    212 
    213   if(!result && size)
    214     throw_std_bad_alloc();
    215 
    216   return result;
    217 }
    218 
    219 /** \internal Frees memory allocated with aligned_malloc. */
    220 inline void aligned_free(void *ptr)
    221 {
    222   #if !EIGEN_ALIGN
    223     std::free(ptr);
    224   #elif EIGEN_MALLOC_ALREADY_ALIGNED
    225     std::free(ptr);
    226   #elif EIGEN_HAS_POSIX_MEMALIGN
    227     std::free(ptr);
    228   #elif EIGEN_HAS_MM_MALLOC
    229     _mm_free(ptr);
    230   #elif defined(_MSC_VER)
    231     _aligned_free(ptr);
    232   #else
    233     handmade_aligned_free(ptr);
    234   #endif
    235 }
    236 
    237 /**
    238 * \internal
    239 * \brief Reallocates an aligned block of memory.
    240 * \throws std::bad_alloc on allocation failure
    241 **/
    242 inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
    243 {
    244   EIGEN_UNUSED_VARIABLE(old_size);
    245 
    246   void *result;
    247 #if !EIGEN_ALIGN
    248   result = std::realloc(ptr,new_size);
    249 #elif EIGEN_MALLOC_ALREADY_ALIGNED
    250   result = std::realloc(ptr,new_size);
    251 #elif EIGEN_HAS_POSIX_MEMALIGN
    252   result = generic_aligned_realloc(ptr,new_size,old_size);
    253 #elif EIGEN_HAS_MM_MALLOC
    254   // The defined(_mm_free) is just here to verify that this MSVC version
    255   // implements _mm_malloc/_mm_free based on the corresponding _aligned_
    256   // functions. This may not always be the case and we just try to be safe.
    257   #if defined(_MSC_VER) && defined(_mm_free)
    258     result = _aligned_realloc(ptr,new_size,16);
    259   #else
    260     result = generic_aligned_realloc(ptr,new_size,old_size);
    261   #endif
    262 #elif defined(_MSC_VER)
    263   result = _aligned_realloc(ptr,new_size,16);
    264 #else
    265   result = handmade_aligned_realloc(ptr,new_size,old_size);
    266 #endif
    267 
    268   if (!result && new_size)
    269     throw_std_bad_alloc();
    270 
    271   return result;
    272 }
    273 
    274 /*****************************************************************************
    275 *** Implementation of conditionally aligned functions                      ***
    276 *****************************************************************************/
    277 
    278 /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
    279   * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
    280   */
    281 template<bool Align> inline void* conditional_aligned_malloc(size_t size)
    282 {
    283   return aligned_malloc(size);
    284 }
    285 
    286 template<> inline void* conditional_aligned_malloc<false>(size_t size)
    287 {
    288   check_that_malloc_is_allowed();
    289 
    290   void *result = std::malloc(size);
    291   if(!result && size)
    292     throw_std_bad_alloc();
    293   return result;
    294 }
    295 
    296 /** \internal Frees memory allocated with conditional_aligned_malloc */
    297 template<bool Align> inline void conditional_aligned_free(void *ptr)
    298 {
    299   aligned_free(ptr);
    300 }
    301 
    302 template<> inline void conditional_aligned_free<false>(void *ptr)
    303 {
    304   std::free(ptr);
    305 }
    306 
    307 template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
    308 {
    309   return aligned_realloc(ptr, new_size, old_size);
    310 }
    311 
    312 template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
    313 {
    314   return std::realloc(ptr, new_size);
    315 }
    316 
    317 /*****************************************************************************
    318 *** Construction/destruction of array elements                             ***
    319 *****************************************************************************/
    320 
    321 /** \internal Constructs the elements of an array.
    322   * The \a size parameter tells on how many objects to call the constructor of T.
    323   */
    324 template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
    325 {
    326   for (size_t i=0; i < size; ++i) ::new (ptr + i) T;
    327   return ptr;
    328 }
    329 
    330 /** \internal Destructs the elements of an array.
    331   * The \a size parameters tells on how many objects to call the destructor of T.
    332   */
    333 template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
    334 {
    335   // always destruct an array starting from the end.
    336   if(ptr)
    337     while(size) ptr[--size].~T();
    338 }
    339 
    340 /*****************************************************************************
    341 *** Implementation of aligned new/delete-like functions                    ***
    342 *****************************************************************************/
    343 
    344 template<typename T>
    345 EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
    346 {
    347   if(size > size_t(-1) / sizeof(T))
    348     throw_std_bad_alloc();
    349 }
    350 
    351 /** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
    352   * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
    353   * The default constructor of T is called.
    354   */
    355 template<typename T> inline T* aligned_new(size_t size)
    356 {
    357   check_size_for_overflow<T>(size);
    358   T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
    359   return construct_elements_of_array(result, size);
    360 }
    361 
    362 template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
    363 {
    364   check_size_for_overflow<T>(size);
    365   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
    366   return construct_elements_of_array(result, size);
    367 }
    368 
    369 /** \internal Deletes objects constructed with aligned_new
    370   * The \a size parameters tells on how many objects to call the destructor of T.
    371   */
    372 template<typename T> inline void aligned_delete(T *ptr, size_t size)
    373 {
    374   destruct_elements_of_array<T>(ptr, size);
    375   aligned_free(ptr);
    376 }
    377 
    378 /** \internal Deletes objects constructed with conditional_aligned_new
    379   * The \a size parameters tells on how many objects to call the destructor of T.
    380   */
    381 template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
    382 {
    383   destruct_elements_of_array<T>(ptr, size);
    384   conditional_aligned_free<Align>(ptr);
    385 }
    386 
    387 template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
    388 {
    389   check_size_for_overflow<T>(new_size);
    390   check_size_for_overflow<T>(old_size);
    391   if(new_size < old_size)
    392     destruct_elements_of_array(pts+new_size, old_size-new_size);
    393   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
    394   if(new_size > old_size)
    395     construct_elements_of_array(result+old_size, new_size-old_size);
    396   return result;
    397 }
    398 
    399 
    400 template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
    401 {
    402   check_size_for_overflow<T>(size);
    403   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
    404   if(NumTraits<T>::RequireInitialization)
    405     construct_elements_of_array(result, size);
    406   return result;
    407 }
    408 
    409 template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size)
    410 {
    411   check_size_for_overflow<T>(new_size);
    412   check_size_for_overflow<T>(old_size);
    413   if(NumTraits<T>::RequireInitialization && (new_size < old_size))
    414     destruct_elements_of_array(pts+new_size, old_size-new_size);
    415   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
    416   if(NumTraits<T>::RequireInitialization && (new_size > old_size))
    417     construct_elements_of_array(result+old_size, new_size-old_size);
    418   return result;
    419 }
    420 
    421 template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
    422 {
    423   if(NumTraits<T>::RequireInitialization)
    424     destruct_elements_of_array<T>(ptr, size);
    425   conditional_aligned_free<Align>(ptr);
    426 }
    427 
    428 /****************************************************************************/
    429 
    430 /** \internal Returns the index of the first element of the array that is well aligned for vectorization.
    431   *
    432   * \param array the address of the start of the array
    433   * \param size the size of the array
    434   *
    435   * \note If no element of the array is well aligned, the size of the array is returned. Typically,
    436   * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
    437   * packet size for the given scalar type is 1, then everything is considered well-aligned.
    438   *
    439   * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
    440   * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
    441   * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
    442   * example with Scalar=double on certain 32-bit platforms, see bug #79.
    443   *
    444   * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
    445   */
    446 template<typename Scalar, typename Index>
    447 static inline Index first_aligned(const Scalar* array, Index size)
    448 {
    449   typedef typename packet_traits<Scalar>::type Packet;
    450   enum { PacketSize = packet_traits<Scalar>::size,
    451          PacketAlignedMask = PacketSize-1
    452   };
    453 
    454   if(PacketSize==1)
    455   {
    456     // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
    457     // of the array have the same alignment.
    458     return 0;
    459   }
    460   else if(size_t(array) & (sizeof(Scalar)-1))
    461   {
    462     // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
    463     // Consequently, no element of the array is well aligned.
    464     return size;
    465   }
    466   else
    467   {
    468     return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
    469                            & PacketAlignedMask, size);
    470   }
    471 }
    472 
    473 
    474 // std::copy is much slower than memcpy, so let's introduce a smart_copy which
    475 // use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
    476 template<typename T, bool UseMemcpy> struct smart_copy_helper;
    477 
    478 template<typename T> void smart_copy(const T* start, const T* end, T* target)
    479 {
    480   smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
    481 }
    482 
    483 template<typename T> struct smart_copy_helper<T,true> {
    484   static inline void run(const T* start, const T* end, T* target)
    485   { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
    486 };
    487 
    488 template<typename T> struct smart_copy_helper<T,false> {
    489   static inline void run(const T* start, const T* end, T* target)
    490   { std::copy(start, end, target); }
    491 };
    492 
    493 
    494 /*****************************************************************************
    495 *** Implementation of runtime stack allocation (falling back to malloc)    ***
    496 *****************************************************************************/
    497 
    498 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
    499 // to the appropriate stack allocation function
    500 #ifndef EIGEN_ALLOCA
    501   #if (defined __linux__)
    502     #define EIGEN_ALLOCA alloca
    503   #elif defined(_MSC_VER)
    504     #define EIGEN_ALLOCA _alloca
    505   #endif
    506 #endif
    507 
    508 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
    509 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
    510 template<typename T> class aligned_stack_memory_handler
    511 {
    512   public:
    513     /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
    514      * Note that \a ptr can be 0 regardless of the other parameters.
    515      * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type T (see NumTraits<T>::RequireInitialization).
    516      * In this case, the buffer elements will also be destructed when this handler will be destructed.
    517      * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
    518      **/
    519     aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc)
    520       : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
    521     {
    522       if(NumTraits<T>::RequireInitialization && m_ptr)
    523         Eigen::internal::construct_elements_of_array(m_ptr, size);
    524     }
    525     ~aligned_stack_memory_handler()
    526     {
    527       if(NumTraits<T>::RequireInitialization && m_ptr)
    528         Eigen::internal::destruct_elements_of_array<T>(m_ptr, m_size);
    529       if(m_deallocate)
    530         Eigen::internal::aligned_free(m_ptr);
    531     }
    532   protected:
    533     T* m_ptr;
    534     size_t m_size;
    535     bool m_deallocate;
    536 };
    537 
    538 } // end namespace internal
    539 
    540 /** \internal
    541   * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
    542   * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
    543   * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
    544   * The allocated buffer is automatically deleted when exiting the scope of this declaration.
    545   * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
    546   * Here is an example:
    547   * \code
    548   * {
    549   *   ei_declare_aligned_stack_constructed_variable(float,data,size,0);
    550   *   // use data[0] to data[size-1]
    551   * }
    552   * \endcode
    553   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
    554   */
    555 #ifdef EIGEN_ALLOCA
    556 
    557   #ifdef __arm__
    558     #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
    559   #else
    560     #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
    561   #endif
    562 
    563   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
    564     Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
    565     TYPE* NAME = (BUFFER)!=0 ? (BUFFER) \
    566                : reinterpret_cast<TYPE*>( \
    567                       (sizeof(TYPE)*SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE)*SIZE) \
    568                     : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
    569     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
    570 
    571 #else
    572 
    573   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
    574     Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
    575     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
    576     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
    577 
    578 #endif
    579 
    580 
    581 /*****************************************************************************
    582 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
    583 *****************************************************************************/
    584 
    585 #if EIGEN_ALIGN
    586   #ifdef EIGEN_EXCEPTIONS
    587     #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
    588       void* operator new(size_t size, const std::nothrow_t&) throw() { \
    589         try { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
    590         catch (...) { return 0; } \
    591         return 0; \
    592       }
    593   #else
    594     #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
    595       void* operator new(size_t size, const std::nothrow_t&) throw() { \
    596         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
    597       }
    598   #endif
    599 
    600   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
    601       void *operator new(size_t size) { \
    602         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
    603       } \
    604       void *operator new[](size_t size) { \
    605         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
    606       } \
    607       void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
    608       void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
    609       /* in-place new and delete. since (at least afaik) there is no actual   */ \
    610       /* memory allocated we can safely let the default implementation handle */ \
    611       /* this particular case. */ \
    612       static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
    613       void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
    614       /* nothrow-new (returns zero instead of std::bad_alloc) */ \
    615       EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
    616       void operator delete(void *ptr, const std::nothrow_t&) throw() { \
    617         Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
    618       } \
    619       typedef void eigen_aligned_operator_new_marker_type;
    620 #else
    621   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
    622 #endif
    623 
    624 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
    625 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
    626   EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
    627 
    628 /****************************************************************************/
    629 
    630 /** \class aligned_allocator
    631 * \ingroup Core_Module
    632 *
    633 * \brief STL compatible allocator to use with with 16 byte aligned types
    634 *
    635 * Example:
    636 * \code
    637 * // Matrix4f requires 16 bytes alignment:
    638 * std::map< int, Matrix4f, std::less<int>,
    639 *           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
    640 * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
    641 * std::map< int, Vector3f > my_map_vec3;
    642 * \endcode
    643 *
    644 * \sa \ref TopicStlContainers.
    645 */
    646 template<class T>
    647 class aligned_allocator
    648 {
    649 public:
    650     typedef size_t    size_type;
    651     typedef std::ptrdiff_t difference_type;
    652     typedef T*        pointer;
    653     typedef const T*  const_pointer;
    654     typedef T&        reference;
    655     typedef const T&  const_reference;
    656     typedef T         value_type;
    657 
    658     template<class U>
    659     struct rebind
    660     {
    661         typedef aligned_allocator<U> other;
    662     };
    663 
    664     pointer address( reference value ) const
    665     {
    666         return &value;
    667     }
    668 
    669     const_pointer address( const_reference value ) const
    670     {
    671         return &value;
    672     }
    673 
    674     aligned_allocator()
    675     {
    676     }
    677 
    678     aligned_allocator( const aligned_allocator& )
    679     {
    680     }
    681 
    682     template<class U>
    683     aligned_allocator( const aligned_allocator<U>& )
    684     {
    685     }
    686 
    687     ~aligned_allocator()
    688     {
    689     }
    690 
    691     size_type max_size() const
    692     {
    693         return (std::numeric_limits<size_type>::max)();
    694     }
    695 
    696     pointer allocate( size_type num, const void* hint = 0 )
    697     {
    698         EIGEN_UNUSED_VARIABLE(hint);
    699         internal::check_size_for_overflow<T>(num);
    700         return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
    701     }
    702 
    703     void construct( pointer p, const T& value )
    704     {
    705         ::new( p ) T( value );
    706     }
    707 
    708     // Support for c++11
    709 #if (__cplusplus >= 201103L)
    710     template<typename... Args>
    711     void  construct(pointer p, Args&&... args)
    712     {
    713       ::new(p) T(std::forward<Args>(args)...);
    714     }
    715 #endif
    716 
    717     void destroy( pointer p )
    718     {
    719         p->~T();
    720     }
    721 
    722     void deallocate( pointer p, size_type /*num*/ )
    723     {
    724         internal::aligned_free( p );
    725     }
    726 
    727     bool operator!=(const aligned_allocator<T>& ) const
    728     { return false; }
    729 
    730     bool operator==(const aligned_allocator<T>& ) const
    731     { return true; }
    732 };
    733 
    734 //---------- Cache sizes ----------
    735 
    736 #if !defined(EIGEN_NO_CPUID)
    737 #  if defined(__GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
    738 #    if defined(__PIC__) && defined(__i386__)
    739        // Case for x86 with PIC
    740 #      define EIGEN_CPUID(abcd,func,id) \
    741          __asm__ __volatile__ ("xchgl %%ebx, %%esi;cpuid; xchgl %%ebx,%%esi": "=a" (abcd[0]), "=S" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
    742 #    else
    743        // Case for x86_64 or x86 w/o PIC
    744 #      define EIGEN_CPUID(abcd,func,id) \
    745          __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id) );
    746 #    endif
    747 #  elif defined(_MSC_VER)
    748 #    if (_MSC_VER > 1500)
    749 #      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
    750 #    endif
    751 #  endif
    752 #endif
    753 
    754 namespace internal {
    755 
    756 #ifdef EIGEN_CPUID
    757 
    758 inline bool cpuid_is_vendor(int abcd[4], const char* vendor)
    759 {
    760   return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2];
    761 }
    762 
    763 inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
    764 {
    765   int abcd[4];
    766   l1 = l2 = l3 = 0;
    767   int cache_id = 0;
    768   int cache_type = 0;
    769   do {
    770     abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
    771     EIGEN_CPUID(abcd,0x4,cache_id);
    772     cache_type  = (abcd[0] & 0x0F) >> 0;
    773     if(cache_type==1||cache_type==3) // data or unified cache
    774     {
    775       int cache_level = (abcd[0] & 0xE0) >> 5;  // A[7:5]
    776       int ways        = (abcd[1] & 0xFFC00000) >> 22; // B[31:22]
    777       int partitions  = (abcd[1] & 0x003FF000) >> 12; // B[21:12]
    778       int line_size   = (abcd[1] & 0x00000FFF) >>  0; // B[11:0]
    779       int sets        = (abcd[2]);                    // C[31:0]
    780 
    781       int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1);
    782 
    783       switch(cache_level)
    784       {
    785         case 1: l1 = cache_size; break;
    786         case 2: l2 = cache_size; break;
    787         case 3: l3 = cache_size; break;
    788         default: break;
    789       }
    790     }
    791     cache_id++;
    792   } while(cache_type>0 && cache_id<16);
    793 }
    794 
    795 inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3)
    796 {
    797   int abcd[4];
    798   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
    799   l1 = l2 = l3 = 0;
    800   EIGEN_CPUID(abcd,0x00000002,0);
    801   unsigned char * bytes = reinterpret_cast<unsigned char *>(abcd)+2;
    802   bool check_for_p2_core2 = false;
    803   for(int i=0; i<14; ++i)
    804   {
    805     switch(bytes[i])
    806     {
    807       case 0x0A: l1 = 8; break;   // 0Ah   data L1 cache, 8 KB, 2 ways, 32 byte lines
    808       case 0x0C: l1 = 16; break;  // 0Ch   data L1 cache, 16 KB, 4 ways, 32 byte lines
    809       case 0x0E: l1 = 24; break;  // 0Eh   data L1 cache, 24 KB, 6 ways, 64 byte lines
    810       case 0x10: l1 = 16; break;  // 10h   data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
    811       case 0x15: l1 = 16; break;  // 15h   code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
    812       case 0x2C: l1 = 32; break;  // 2Ch   data L1 cache, 32 KB, 8 ways, 64 byte lines
    813       case 0x30: l1 = 32; break;  // 30h   code L1 cache, 32 KB, 8 ways, 64 byte lines
    814       case 0x60: l1 = 16; break;  // 60h   data L1 cache, 16 KB, 8 ways, 64 byte lines, sectored
    815       case 0x66: l1 = 8; break;   // 66h   data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored
    816       case 0x67: l1 = 16; break;  // 67h   data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored
    817       case 0x68: l1 = 32; break;  // 68h   data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored
    818       case 0x1A: l2 = 96; break;   // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64)
    819       case 0x22: l3 = 512; break;   // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored
    820       case 0x23: l3 = 1024; break;   // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
    821       case 0x25: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored
    822       case 0x29: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored
    823       case 0x39: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored
    824       case 0x3A: l2 = 192; break;   // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored
    825       case 0x3B: l2 = 128; break;   // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored
    826       case 0x3C: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored
    827       case 0x3D: l2 = 384; break;   // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored
    828       case 0x3E: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored
    829       case 0x40: l2 = 0; break;   // no integrated L2 cache (P6 core) or L3 cache (P4 core)
    830       case 0x41: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 32 byte lines
    831       case 0x42: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 32 byte lines
    832       case 0x43: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 32 byte lines
    833       case 0x44: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines
    834       case 0x45: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines
    835       case 0x46: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines
    836       case 0x47: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 8 ways, 64 byte lines
    837       case 0x48: l2 = 3072; break;   // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines
    838       case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break;// code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2
    839       case 0x4A: l3 = 6144; break;   // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines
    840       case 0x4B: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines
    841       case 0x4C: l3 = 12288; break;   // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines
    842       case 0x4D: l3 = 16384; break;   // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines
    843       case 0x4E: l2 = 6144; break;   // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines
    844       case 0x78: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines
    845       case 0x79: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored
    846       case 0x7A: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored
    847       case 0x7B: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored
    848       case 0x7C: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
    849       case 0x7D: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines
    850       case 0x7E: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. (IA-64)
    851       case 0x7F: l2 = 512; break;   // code and data L2 cache, 512 KB, 2 ways, 64 byte lines
    852       case 0x80: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines
    853       case 0x81: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 32 byte lines
    854       case 0x82: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 32 byte lines
    855       case 0x83: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 32 byte lines
    856       case 0x84: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines
    857       case 0x85: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines
    858       case 0x86: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines
    859       case 0x87: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines
    860       case 0x88: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64)
    861       case 0x89: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64)
    862       case 0x8A: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64)
    863       case 0x8D: l3 = 3072; break;   // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64)
    864 
    865       default: break;
    866     }
    867   }
    868   if(check_for_p2_core2 && l2 == l3)
    869     l3 = 0;
    870   l1 *= 1024;
    871   l2 *= 1024;
    872   l3 *= 1024;
    873 }
    874 
    875 inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs)
    876 {
    877   if(max_std_funcs>=4)
    878     queryCacheSizes_intel_direct(l1,l2,l3);
    879   else
    880     queryCacheSizes_intel_codes(l1,l2,l3);
    881 }
    882 
    883 inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
    884 {
    885   int abcd[4];
    886   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
    887   EIGEN_CPUID(abcd,0x80000005,0);
    888   l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
    889   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
    890   EIGEN_CPUID(abcd,0x80000006,0);
    891   l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
    892   l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
    893 }
    894 #endif
    895 
    896 /** \internal
    897  * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively */
    898 inline void queryCacheSizes(int& l1, int& l2, int& l3)
    899 {
    900   #ifdef EIGEN_CPUID
    901   int abcd[4];
    902 
    903   // identify the CPU vendor
    904   EIGEN_CPUID(abcd,0x0,0);
    905   int max_std_funcs = abcd[1];
    906   if(cpuid_is_vendor(abcd,"GenuineIntel"))
    907     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
    908   else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!"))
    909     queryCacheSizes_amd(l1,l2,l3);
    910   else
    911     // by default let's use Intel's API
    912     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
    913 
    914   // here is the list of other vendors:
    915 //   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
    916 //   ||cpuid_is_vendor(abcd,"CyrixInstead")
    917 //   ||cpuid_is_vendor(abcd,"CentaurHauls")
    918 //   ||cpuid_is_vendor(abcd,"GenuineTMx86")
    919 //   ||cpuid_is_vendor(abcd,"TransmetaCPU")
    920 //   ||cpuid_is_vendor(abcd,"RiseRiseRise")
    921 //   ||cpuid_is_vendor(abcd,"Geode by NSC")
    922 //   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
    923 //   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
    924 //   ||cpuid_is_vendor(abcd,"NexGenDriven")
    925   #else
    926   l1 = l2 = l3 = -1;
    927   #endif
    928 }
    929 
    930 /** \internal
    931  * \returns the size in Bytes of the L1 data cache */
    932 inline int queryL1CacheSize()
    933 {
    934   int l1(-1), l2, l3;
    935   queryCacheSizes(l1,l2,l3);
    936   return l1;
    937 }
    938 
    939 /** \internal
    940  * \returns the size in Bytes of the L2 or L3 cache if this later is present */
    941 inline int queryTopLevelCacheSize()
    942 {
    943   int l1, l2(-1), l3(-1);
    944   queryCacheSizes(l1,l2,l3);
    945   return (std::max)(l2,l3);
    946 }
    947 
    948 } // end namespace internal
    949 
    950 } // end namespace Eigen
    951 
    952 #endif // EIGEN_MEMORY_H
    953