/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
     23 #pragma once
     24 
     25 #include "simdlib_types.hpp"
     26 
     27 // For documentation, please see the following include...
     28 // #include "simdlib_interface.hpp"
     29 
     30 namespace SIMDImpl
     31 {
     32     namespace SIMD128Impl
     33     {
     34 #if SIMD_ARCH >= SIMD_ARCH_AVX
     35         struct AVXImpl
     36         {
     37 #define __SIMD_LIB_AVX_HPP__
     38 #include "simdlib_128_avx.inl"
     39 #undef __SIMD_LIB_AVX_HPP__
     40         }; // struct AVXImpl
     41 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
     42 
     43 
     44 #if SIMD_ARCH >= SIMD_ARCH_AVX2
     45         struct AVX2Impl : AVXImpl
     46         {
     47 #define __SIMD_LIB_AVX2_HPP__
     48 #include "simdlib_128_avx2.inl"
     49 #undef __SIMD_LIB_AVX2_HPP__
     50         }; // struct AVX2Impl
     51 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
     52 
     53 #if SIMD_ARCH >= SIMD_ARCH_AVX512
     54         struct AVX512Impl : AVX2Impl
     55         {
     56 #if defined(SIMD_OPT_128_AVX512)
     57 #define __SIMD_LIB_AVX512_HPP__
     58 #include "simdlib_128_avx512.inl"
     59 #if defined(SIMD_ARCH_KNIGHTS)
     60 #include "simdlib_128_avx512_knights.inl"
     61 #else // optimize for core
     62 #include "simdlib_128_avx512_core.inl"
     63 #endif // defined(SIMD_ARCH_KNIGHTS)
     64 #undef __SIMD_LIB_AVX512_HPP__
     65 #endif // SIMD_OPT_128_AVX512
     66         }; // struct AVX2Impl
     67 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
     68 
     69         struct Traits : SIMDImpl::Traits
     70         {
     71 #if SIMD_ARCH == SIMD_ARCH_AVX
     72             using IsaImpl = AVXImpl;
     73 #elif SIMD_ARCH == SIMD_ARCH_AVX2
     74             using IsaImpl = AVX2Impl;
     75 #elif SIMD_ARCH == SIMD_ARCH_AVX512
     76             using IsaImpl = AVX512Impl;
     77 #else
     78 #error Invalid value for SIMD_ARCH
     79 #endif
     80 
     81             using Float     = SIMD128Impl::Float;
     82             using Double    = SIMD128Impl::Double;
     83             using Integer   = SIMD128Impl::Integer;
     84             using Vec4      = SIMD128Impl::Vec4;
     85             using Mask      = SIMD128Impl::Mask;
     86         };
     87     } // ns SIMD128Impl
     88 
     89     namespace SIMD256Impl
     90     {
     91 #if SIMD_ARCH >= SIMD_ARCH_AVX
     92         struct AVXImpl
     93         {
     94 #define __SIMD_LIB_AVX_HPP__
     95 #include "simdlib_256_avx.inl"
     96 #undef __SIMD_LIB_AVX_HPP__
     97         }; // struct AVXImpl
     98 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
     99 
    100 
    101 #if SIMD_ARCH >= SIMD_ARCH_AVX2
    102         struct AVX2Impl : AVXImpl
    103         {
    104 #define __SIMD_LIB_AVX2_HPP__
    105 #include "simdlib_256_avx2.inl"
    106 #undef __SIMD_LIB_AVX2_HPP__
    107         }; // struct AVX2Impl
    108 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
    109 
    110 #if SIMD_ARCH >= SIMD_ARCH_AVX512
    111         struct AVX512Impl : AVX2Impl
    112         {
    113 #if defined(SIMD_OPT_256_AVX512)
    114 #define __SIMD_LIB_AVX512_HPP__
    115 #include "simdlib_256_avx512.inl"
    116 #if defined(SIMD_ARCH_KNIGHTS)
    117 #include "simdlib_256_avx512_knights.inl"
    118 #else // optimize for core
    119 #include "simdlib_256_avx512_core.inl"
    120 #endif // defined(SIMD_ARCH_KNIGHTS)
    121 #undef __SIMD_LIB_AVX512_HPP__
    122 #endif // SIMD_OPT_256_AVX512
    123         }; // struct AVX2Impl
    124 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
    125 
    126         struct Traits : SIMDImpl::Traits
    127         {
    128 #if SIMD_ARCH == SIMD_ARCH_AVX
    129             using IsaImpl = AVXImpl;
    130 #elif SIMD_ARCH == SIMD_ARCH_AVX2
    131             using IsaImpl = AVX2Impl;
    132 #elif SIMD_ARCH == SIMD_ARCH_AVX512
    133             using IsaImpl = AVX512Impl;
    134 #else
    135 #error Invalid value for SIMD_ARCH
    136 #endif
    137 
    138             using Float     = SIMD256Impl::Float;
    139             using Double    = SIMD256Impl::Double;
    140             using Integer   = SIMD256Impl::Integer;
    141             using Vec4      = SIMD256Impl::Vec4;
    142             using Mask      = SIMD256Impl::Mask;
    143         };
    144     } // ns SIMD256Impl
    145 
    146     namespace SIMD512Impl
    147     {
    148 #if SIMD_ARCH >= SIMD_ARCH_AVX
    149         template<typename SIMD256T>
    150         struct AVXImplBase
    151         {
    152 #define __SIMD_LIB_AVX_HPP__
    153 #include "simdlib_512_emu.inl"
    154 #include "simdlib_512_emu_masks.inl"
    155 #undef __SIMD_LIB_AVX_HPP__
    156         }; // struct AVXImplBase
    157         using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
    158 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
    159 
    160 
    161 #if SIMD_ARCH >= SIMD_ARCH_AVX2
    162         using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
    163 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
    164 
    165 
    166 #if SIMD_ARCH >= SIMD_ARCH_AVX512
    167         struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
    168         {
    169 #define __SIMD_LIB_AVX512_HPP__
    170 #include "simdlib_512_avx512.inl"
    171 #include "simdlib_512_avx512_masks.inl"
    172 #if defined(SIMD_ARCH_KNIGHTS)
    173 #include "simdlib_512_avx512_knights.inl"
    174 #include "simdlib_512_avx512_masks_knights.inl"
    175 #else // optimize for core
    176 #include "simdlib_512_avx512_core.inl"
    177 #include "simdlib_512_avx512_masks_core.inl"
    178 #endif // defined(SIMD_ARCH_KNIGHTS)
    179 #undef __SIMD_LIB_AVX512_HPP__
    180         }; // struct AVX512ImplBase
    181 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
    182 
    183         struct Traits : SIMDImpl::Traits
    184         {
    185 #if SIMD_ARCH == SIMD_ARCH_AVX
    186             using IsaImpl = AVXImpl;
    187 #elif SIMD_ARCH == SIMD_ARCH_AVX2
    188             using IsaImpl = AVX2Impl;
    189 #elif SIMD_ARCH == SIMD_ARCH_AVX512
    190             using IsaImpl = AVX512Impl;
    191 #else
    192 #error Invalid value for SIMD_ARCH
    193 #endif
    194 
    195             using Float     = SIMD512Impl::Float;
    196             using Double    = SIMD512Impl::Double;
    197             using Integer   = SIMD512Impl::Integer;
    198             using Vec4      = SIMD512Impl::Vec4;
    199             using Mask      = SIMD512Impl::Mask;
    200         };
    201     } // ns SIMD512Impl
    202 } // ns SIMDImpl
    203 
    204 template <typename Traits>
    205 struct SIMDBase : Traits::IsaImpl
    206 {
    207     using CompareType   = typename Traits::CompareType;
    208     using ScaleFactor   = typename Traits::ScaleFactor;
    209     using RoundMode     = typename Traits::RoundMode;
    210     using SIMD          = typename Traits::IsaImpl;
    211     using Float         = typename Traits::Float;
    212     using Double        = typename Traits::Double;
    213     using Integer       = typename Traits::Integer;
    214     using Vec4          = typename Traits::Vec4;
    215     using Mask          = typename Traits::Mask;
    216 
    217     static const size_t VECTOR_BYTES = sizeof(Float);
    218 
    219     // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    220     static SIMDINLINE
    221     void vec4_load1_ps(Vec4& r, const float *p)
    222     {
    223         r[0] = SIMD::set1_ps(p[0]);
    224         r[1] = SIMD::set1_ps(p[1]);
    225         r[2] = SIMD::set1_ps(p[2]);
    226         r[3] = SIMD::set1_ps(p[3]);
    227     }
    228 
    229     static SIMDINLINE
    230     void vec4_set1_vps(Vec4& r, Float const &s)
    231     {
    232         r[0] = s;
    233         r[1] = s;
    234         r[2] = s;
    235         r[3] = s;
    236     }
    237 
    238     static SIMDINLINE
    239     Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
    240     {
    241         Float tmp, r;
    242         r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
    243 
    244         tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
    245         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
    246 
    247         tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
    248         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    249 
    250         return r;
    251     }
    252 
    253     static SIMDINLINE
    254     Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
    255     {
    256         Float tmp, r;
    257         r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
    258 
    259         tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
    260         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
    261 
    262         tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
    263         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    264 
    265         tmp = SIMD::mul_ps(v0[3], v1[3]);     // (v0.w*v1.w)
    266         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    267 
    268         return r;
    269     }
    270 
    271     static SIMDINLINE
    272     Float vec4_rcp_length_ps(const Vec4& v)
    273     {
    274         Float length = vec4_dp4_ps(v, v);
    275         return SIMD::rsqrt_ps(length);
    276     }
    277 
    278     static SIMDINLINE
    279     void vec4_normalize_ps(Vec4& r, const Vec4& v)
    280     {
    281         Float rcpLength = vec4_rcp_length_ps(v);
    282 
    283         r[0] = SIMD::mul_ps(v[0], rcpLength);
    284         r[1] = SIMD::mul_ps(v[1], rcpLength);
    285         r[2] = SIMD::mul_ps(v[2], rcpLength);
    286         r[3] = SIMD::mul_ps(v[3], rcpLength);
    287     }
    288 
    289     static SIMDINLINE
    290     void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
    291     {
    292         r[0] = SIMD::mul_ps(v[0], s);
    293         r[1] = SIMD::mul_ps(v[1], s);
    294         r[2] = SIMD::mul_ps(v[2], s);
    295         r[3] = SIMD::mul_ps(v[3], s);
    296     }
    297 
    298     static SIMDINLINE
    299     void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    300     {
    301         r[0] = SIMD::mul_ps(v0[0], v1[0]);
    302         r[1] = SIMD::mul_ps(v0[1], v1[1]);
    303         r[2] = SIMD::mul_ps(v0[2], v1[2]);
    304         r[3] = SIMD::mul_ps(v0[3], v1[3]);
    305     }
    306 
    307     static SIMDINLINE
    308     void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
    309     {
    310         r[0] = SIMD::add_ps(v0[0], s);
    311         r[1] = SIMD::add_ps(v0[1], s);
    312         r[2] = SIMD::add_ps(v0[2], s);
    313         r[3] = SIMD::add_ps(v0[3], s);
    314     }
    315 
    316     static SIMDINLINE
    317     void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    318     {
    319         r[0] = SIMD::add_ps(v0[0], v1[0]);
    320         r[1] = SIMD::add_ps(v0[1], v1[1]);
    321         r[2] = SIMD::add_ps(v0[2], v1[2]);
    322         r[3] = SIMD::add_ps(v0[3], v1[3]);
    323     }
    324 
    325     static SIMDINLINE
    326     void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
    327     {
    328         r[0] = SIMD::min_ps(v0[0], s);
    329         r[1] = SIMD::min_ps(v0[1], s);
    330         r[2] = SIMD::min_ps(v0[2], s);
    331         r[3] = SIMD::min_ps(v0[3], s);
    332     }
    333 
    334     static SIMDINLINE
    335     void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
    336     {
    337         r[0] = SIMD::max_ps(v0[0], s);
    338         r[1] = SIMD::max_ps(v0[1], s);
    339         r[2] = SIMD::max_ps(v0[2], s);
    340         r[3] = SIMD::max_ps(v0[3], s);
    341     }
    342 
    343     // Matrix4x4 * Vector4
    344     //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    345     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    346     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    347     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    348     static SIMDINLINE
    349     void SIMDCALL mat4x4_vec4_multiply(
    350         Vec4& result,
    351         const float *pMatrix,
    352         const Vec4& v)
    353     {
    354         Float m;
    355         Float r0;
    356         Float r1;
    357 
    358         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    359         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    360         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    361         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    362         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    363         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    364         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    365         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    366         m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    367         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
    368         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
    369         result[0] = r0;
    370 
    371         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    372         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    373         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    374         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    375         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    376         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    377         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    378         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    379         m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    380         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
    381         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
    382         result[1] = r0;
    383 
    384         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    385         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    386         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    387         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    388         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    389         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    390         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    391         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    392         m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    393         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
    394         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
    395         result[2] = r0;
    396 
    397         m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    398         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    399         m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    400         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    401         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    402         m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    403         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    404         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    405         m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    406         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
    407         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
    408         result[3] = r0;
    409     }
    410 
    411     // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    412     //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
    413     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
    414     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
    415     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
    416     static SIMDINLINE
    417     void SIMDCALL mat3x3_vec3_w0_multiply(
    418         Vec4& result,
    419         const float *pMatrix,
    420         const Vec4& v)
    421     {
    422         Float m;
    423         Float r0;
    424         Float r1;
    425 
    426         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    427         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    428         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    429         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    430         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    431         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    432         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    433         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    434         result[0] = r0;
    435 
    436         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    437         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    438         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    439         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    440         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    441         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    442         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    443         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    444         result[1] = r0;
    445 
    446         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    447         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    448         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    449         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    450         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    451         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    452         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    453         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    454         result[2] = r0;
    455 
    456         result[3] = SIMD::setzero_ps();
    457     }
    458 
    459     // Matrix4x4 * Vector3 - Position vector where w = 1.
    460     //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    461     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    462     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    463     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
    464     static SIMDINLINE
    465     void SIMDCALL mat4x4_vec3_w1_multiply(
    466         Vec4& result,
    467         const float *pMatrix,
    468         const Vec4& v)
    469     {
    470         Float m;
    471         Float r0;
    472         Float r1;
    473 
    474         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    475         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    476         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    477         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    478         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    479         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    480         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    481         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    482         m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    483         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    484         result[0] = r0;
    485 
    486         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    487         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    488         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    489         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    490         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    491         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    492         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    493         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    494         m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    495         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    496         result[1] = r0;
    497 
    498         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    499         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    500         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    501         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    502         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    503         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    504         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    505         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    506         m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    507         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    508         result[2] = r0;
    509 
    510         m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    511         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    512         m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    513         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    514         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    515         m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    516         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    517         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    518         m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    519         result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    520     }
    521 
    522     static SIMDINLINE
    523     void SIMDCALL mat4x3_vec3_w1_multiply(
    524         Vec4& result,
    525         const float *pMatrix,
    526         const Vec4& v)
    527     {
    528         Float m;
    529         Float r0;
    530         Float r1;
    531 
    532         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    533         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    534         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    535         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    536         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    537         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    538         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    539         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    540         m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    541         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    542         result[0] = r0;
    543 
    544         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    545         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    546         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    547         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    548         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    549         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    550         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    551         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    552         m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    553         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    554         result[1] = r0;
    555 
    556         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    557         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
    558         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    559         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
    560         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
    561         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    562         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
    563         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    564         m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    565         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
    566         result[2] = r0;
    567         result[3] = SIMD::set1_ps(1.0f);
    568     }
    569 }; // struct SIMDBase
    570 
    571 using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
    572 using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
    573 using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
    574