      1 /*
      2  Copyright (c) 2011 Apple Inc.
      3  http://continuousphysics.com/Bullet/
      4 
      5  This software is provided 'as-is', without any express or implied warranty.
      6  In no event will the authors be held liable for any damages arising from the use of this software.
      7  Permission is granted to anyone to use this software for any purpose,
      8  including commercial applications, and to alter it and redistribute it freely,
      9  subject to the following restrictions:
     10 
     11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
     12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
     13  3. This notice may not be removed or altered from any source distribution.
     14 
     15  This source version has been altered.
     16  */
     17 
     18 #if defined (_WIN32) || defined (__i386__)
     19 #define BT_USE_SSE_IN_API
     20 #endif
     21 
     22 
     23 #include "btVector3.h"
     24 
     25 
     26 
     27 #if defined BT_USE_SIMD_VECTOR3
     28 
     29 #if DEBUG
     30 #include <string.h>//for memset
     31 #endif
     32 
     33 
     34 #ifdef __APPLE__
     35 #include <stdint.h>
     36 typedef  float float4 __attribute__ ((vector_size(16)));
     37 #else
     38 #define float4 __m128
     39 #endif
     40 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
     41 
     42 
     43 #if defined BT_USE_SSE || defined _WIN32
     44 
     45 #define LOG2_ARRAY_SIZE     6
     46 #define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)
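// Each full tile buffers STACK_ARRAY_COUNT (64) float4 results on the stack; with four dot
// products per float4 that is 256 vertices per tile. Buffering the results lets the code find
// the index of the winning dot product in a cheap second pass, and only when a new extreme
// value was actually seen in the tile.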
     47 
     48 #include <emmintrin.h>
     49 
     50 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
     51 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
     52 {
     53     const float4 *vertices = (const float4*) vv;
     54     static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
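    // indexTable maps the 4-bit result of _mm_movemask_ps to the lane number of its lowest set
    // bit. Entry 0 is never used: the table is only consulted after a compare that is known to
    // have matched at least one lane.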
     55     float4 dotMax = btAssign128( -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY );
     56     float4 vvec = _mm_loadu_ps( vec );
     57     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
     58     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
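    // Layout: vLo = (vec.x, vec.y, vec.x, vec.y) and vHi = (vec.z, vec.z, vec.z, vec.z), so each
    // result lane below is the scalar dot product v.x*vec[0] + v.y*vec[1] + v.z*vec[2]; the w
    // components of the input vertices are never read.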
     59 
     60     long maxIndex = -1L;
     61 
     62     size_t segment = 0;
     63     float4 stack_array[ STACK_ARRAY_COUNT ];
     64 
     65 #if DEBUG
     66     //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
     67 #endif
     68 
     69     size_t index;
     70     float4 max;
     71     // Faster loop without cleanup code for full tiles
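    // The outer loop consumes full tiles of STACK_ARRAY_COUNT*4 vertices; the inner loop is
    // unrolled four times, producing four float4 results (16 dot products) per iteration.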
     72     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
     73     {
     74         max = dotMax;
     75 
     76         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
     77         { // do four dot products at a time. Carefully avoid touching the w element.
     78             float4 v0 = vertices[0];
     79             float4 v1 = vertices[1];
     80             float4 v2 = vertices[2];
     81             float4 v3 = vertices[3];            vertices += 4;
     82 
     83             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
     84             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
     85             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
     86             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
     87 
     88             lo0 = lo0*vLo;
     89             lo1 = lo1*vLo;
     90             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
     91             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
     92             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
     93             z = z*vHi;
     94             x = x+y;
     95             x = x+z;
     96             stack_array[index] = x;
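            // _mm_max_ps(a, b) returns b whenever a is NaN, so passing x as the first argument
            // guarantees that a NaN dot product can never displace the running max.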
     97             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
     98 
     99             v0 = vertices[0];
    100             v1 = vertices[1];
    101             v2 = vertices[2];
    102             v3 = vertices[3];            vertices += 4;
    103 
    104             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    105             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    106             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    107             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    108 
    109             lo0 = lo0*vLo;
    110             lo1 = lo1*vLo;
    111             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    112             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    113             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    114             z = z*vHi;
    115             x = x+y;
    116             x = x+z;
    117             stack_array[index+1] = x;
    118             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    119 
    120             v0 = vertices[0];
    121             v1 = vertices[1];
    122             v2 = vertices[2];
    123             v3 = vertices[3];            vertices += 4;
    124 
    125             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    126             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    127             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    128             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    129 
    130             lo0 = lo0*vLo;
    131             lo1 = lo1*vLo;
    132             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    133             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    134             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    135             z = z*vHi;
    136             x = x+y;
    137             x = x+z;
    138             stack_array[index+2] = x;
    139             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    140 
    141             v0 = vertices[0];
    142             v1 = vertices[1];
    143             v2 = vertices[2];
    144             v3 = vertices[3];            vertices += 4;
    145 
    146             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    147             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    148             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    149             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    150 
    151             lo0 = lo0*vLo;
    152             lo1 = lo1*vLo;
    153             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    154             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    155             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    156             z = z*vHi;
    157             x = x+y;
    158             x = x+z;
    159             stack_array[index+3] = x;
    160             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    161 
    162             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
    163         }
    164 
    165         // If we found a new max
    166         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
    167         {
    168             // copy the new max across all lanes of our max accumulator
    169             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
    170             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
    171 
    172             dotMax = max;
    173 
    174             // find first occurrence of that max
    175             size_t test;
            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // the new max is guaranteed to be in stack_array, so this loop terminates
    177             {}
    178             // record where it is.
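            // segment is the vertex index where this tile starts, 4*index is the offset of the
            // matching float4 within the tile, and indexTable picks the lane inside that float4.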
    179             maxIndex = 4*index + segment + indexTable[test];
    180         }
    181     }
    182 
    183     // account for work we've already done
    184     count -= segment;
    185 
    186     // Deal with the last < STACK_ARRAY_COUNT vectors
    187     max = dotMax;
    188     index = 0;
    189 
    190 
    191     if( btUnlikely( count > 16) )
    192     {
    193         for( ; index + 4 <= count / 4; index+=4 )
    194         { // do four dot products at a time. Carefully avoid touching the w element.
    195             float4 v0 = vertices[0];
    196             float4 v1 = vertices[1];
    197             float4 v2 = vertices[2];
    198             float4 v3 = vertices[3];            vertices += 4;
    199 
    200             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    201             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    202             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    203             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    204 
    205             lo0 = lo0*vLo;
    206             lo1 = lo1*vLo;
    207             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
    208             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    209             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    210             z = z*vHi;
    211             x = x+y;
    212             x = x+z;
    213             stack_array[index] = x;
    214             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    215 
    216             v0 = vertices[0];
    217             v1 = vertices[1];
    218             v2 = vertices[2];
    219             v3 = vertices[3];            vertices += 4;
    220 
    221             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    222             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    223             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    224             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    225 
    226             lo0 = lo0*vLo;
    227             lo1 = lo1*vLo;
    228             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    229             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    230             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    231             z = z*vHi;
    232             x = x+y;
    233             x = x+z;
    234             stack_array[index+1] = x;
    235             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    236 
    237             v0 = vertices[0];
    238             v1 = vertices[1];
    239             v2 = vertices[2];
    240             v3 = vertices[3];            vertices += 4;
    241 
    242             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    243             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    244             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    245             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    246 
    247             lo0 = lo0*vLo;
    248             lo1 = lo1*vLo;
    249             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    250             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    251             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    252             z = z*vHi;
    253             x = x+y;
    254             x = x+z;
    255             stack_array[index+2] = x;
    256             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    257 
    258             v0 = vertices[0];
    259             v1 = vertices[1];
    260             v2 = vertices[2];
    261             v3 = vertices[3];            vertices += 4;
    262 
    263             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    264             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    265             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    266             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    267 
    268             lo0 = lo0*vLo;
    269             lo1 = lo1*vLo;
    270             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    271             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    272             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    273             z = z*vHi;
    274             x = x+y;
    275             x = x+z;
    276             stack_array[index+3] = x;
    277             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    278 
    279             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
    280         }
    281     }
    282 
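    // localCount is the number of vertices, rounded down to a multiple of 4, that the loops
    // above have not yet processed (index counts float4 results, i.e. 4 vertices each).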
    283     size_t localCount = (count & -4L) - 4*index;
    284     if( localCount )
    285     {
    286 #ifdef __APPLE__
        float4 t0, t1, t2, t3, t4;
        float4 * sap = &stack_array[index + localCount / 4];
        vertices += localCount;      // pre-advance past this block; the asm below indexes it backwards via byteIndex
        size_t byteIndex = -(localCount) * sizeof(float);
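        // byteIndex starts negative and is stepped toward zero so the loop can end on a single
        // add/jnz pair. Scaled by 4 in the vertex addressing it spans the 16*localCount bytes
        // that 'vertices' was pre-advanced past; used unscaled against sap it drops one float4
        // result per iteration into stack_array starting at stack_array[index].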
    291         //AT&T Code style assembly
    292         asm volatile
    293         (   ".align 4                                                                   \n\
    294              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
    295           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
    296           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
    297           movaps  %[t0], %[max]                               // vertices[0]      \n\
    298           movlhps %[t1], %[max]                               // x0y0x1y1         \n\
    299          movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
    300          movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
    301           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
    302          movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
    303          movaps  %[t3], %[t0]                                // vertices[2]      \n\
    304          movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
    305          mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
    306           movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
    307           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
    308           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
    309          movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
    310          shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
    311          shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
    312          addps   %[t3], %[max]                               // x + y            \n\
    313          addps   %[t1], %[max]                               // x + y + z        \n\
    314          movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
    315          maxps   %[t2], %[max]                               // record max, restore max   \n\
    316          add     $16, %[byteIndex]                           // advance loop counter\n\
    317          jnz     0b                                          \n\
    318      "
    319          : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
    320          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
    321          : "memory", "cc"
    322          );
    323         index += localCount/4;
    324 #else
    325         {
    326             for( unsigned int i=0; i<localCount/4; i++,index++)
    327             { // do four dot products at a time. Carefully avoid touching the w element.
    328                 float4 v0 = vertices[0];
    329                 float4 v1 = vertices[1];
    330                 float4 v2 = vertices[2];
    331                 float4 v3 = vertices[3];
    332                 vertices += 4;
    333 
    334                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    335                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    336                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    337                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    338 
    339                 lo0 = lo0*vLo;
    340                 lo1 = lo1*vLo;
    341                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
    342                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    343                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    344                 z = z*vHi;
    345                 x = x+y;
    346                 x = x+z;
    347                 stack_array[index] = x;
    348                 max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    349             }
    350         }
    351 #endif //__APPLE__
    352     }
    353 
    354     // process the last few points
    355     if( count & 3 )
    356     {
    357         float4 v0, v1, v2, x, y, z;
    358         switch( count & 3 )
    359         {
    360             case 3:
    361             {
    362                 v0 = vertices[0];
    363                 v1 = vertices[1];
    364                 v2 = vertices[2];
    365 
    366                 // Calculate 3 dot products, transpose, duplicate v2
    367                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
    368                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
    369                 lo0 = lo0*vLo;
    370                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
    371                 z = z*vHi;
    372                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
    373                 lo1 = lo1*vLo;
    374                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    375                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    376             }
    377                 break;
    378             case 2:
    379             {
    380                 v0 = vertices[0];
    381                 v1 = vertices[1];
    382                 float4 xy = _mm_movelh_ps(v0, v1);
    383                 z = _mm_movehl_ps(v1, v0);
    384                 xy = xy*vLo;
    385                 z = _mm_shuffle_ps( z, z,  0xa8);
    386                 x = _mm_shuffle_ps( xy, xy, 0xa8);
    387                 y = _mm_shuffle_ps( xy, xy, 0xfd);
    388                 z = z*vHi;
    389             }
    390                 break;
    391             case 1:
    392             {
    393                 float4 xy = vertices[0];
    394                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
    395                 xy = xy*vLo;
    396                 z = z*vHi;
    397                 x = _mm_shuffle_ps(xy, xy, 0);
    398                 y = _mm_shuffle_ps(xy, xy, 0x55);
    399             }
    400                 break;
    401         }
    402         x = x+y;
    403         x = x+z;
    404         stack_array[index] = x;
    405         max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
    406         index++;
    407     }
    408 
    409     // if we found a new max.
    410     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
    411     { // we found a new max. Search for it
    412       // find max across the max vector, place in all elements of max -- big latency hit here
    413         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
    414         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
    415 
    416         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
    417         // this where it actually makes a difference is handled in the early out at the top of the function,
    418         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
    419         // complexity, and removed it.
    420 
    421         dotMax = max;
    422 
        // scan for the first occurrence of max in the array
        size_t test;
        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // the max is guaranteed to be in stack_array, so this loop terminates
    426         {}
    427         maxIndex = 4*index + segment + indexTable[test];
    428     }
    429 
    430     _mm_store_ss( dotResult, dotMax);
    431     return maxIndex;
    432 }
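// Illustrative sketch (not part of this file): callers are expected to pass a packed array of
// 16-byte, 4-float vertices and a direction vector, e.g. roughly
//
//     float bestDot;
//     long best = _maxdot_large( (const float*) vertexArray, (const float*) &direction,
//                                vertexCount, &bestDot );
//
// where 'vertexArray', 'direction' and 'vertexCount' are placeholders; in Bullet the public
// entry points wrapping these helpers live in btVector3.h.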
    433 
    434 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
    435 
    436 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
    437 {
    438     const float4 *vertices = (const float4*) vv;
    439     static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    440     float4 dotmin = btAssign128( BT_INFINITY,  BT_INFINITY,  BT_INFINITY,  BT_INFINITY );
    441     float4 vvec = _mm_loadu_ps( vec );
    442     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
    443     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
    444 
    445     long minIndex = -1L;
    446 
    447     size_t segment = 0;
    448     float4 stack_array[ STACK_ARRAY_COUNT ];
    449 
    450 #if DEBUG
    451     //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
    452 #endif
    453 
    454     size_t index;
    455     float4 min;
    456     // Faster loop without cleanup code for full tiles
    457     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    458     {
    459         min = dotmin;
    460 
    461         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
    462         { // do four dot products at a time. Carefully avoid touching the w element.
    463             float4 v0 = vertices[0];
    464             float4 v1 = vertices[1];
    465             float4 v2 = vertices[2];
    466             float4 v3 = vertices[3];            vertices += 4;
    467 
    468             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    469             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    470             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    471             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    472 
    473             lo0 = lo0*vLo;
    474             lo1 = lo1*vLo;
    475             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
    476             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    477             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    478             z = z*vHi;
    479             x = x+y;
    480             x = x+z;
    481             stack_array[index] = x;
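            // As in _maxdot_large: _mm_min_ps(x, min) returns min when x is NaN, so NaNs can
            // never propagate into the running minimum.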
    482             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    483 
    484             v0 = vertices[0];
    485             v1 = vertices[1];
    486             v2 = vertices[2];
    487             v3 = vertices[3];            vertices += 4;
    488 
    489             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    490             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    491             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    492             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    493 
    494             lo0 = lo0*vLo;
    495             lo1 = lo1*vLo;
    496             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    497             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    498             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    499             z = z*vHi;
    500             x = x+y;
    501             x = x+z;
    502             stack_array[index+1] = x;
    503             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    504 
    505             v0 = vertices[0];
    506             v1 = vertices[1];
    507             v2 = vertices[2];
    508             v3 = vertices[3];            vertices += 4;
    509 
    510             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    511             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    512             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    513             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    514 
    515             lo0 = lo0*vLo;
    516             lo1 = lo1*vLo;
    517             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    518             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    519             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    520             z = z*vHi;
    521             x = x+y;
    522             x = x+z;
    523             stack_array[index+2] = x;
    524             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    525 
    526             v0 = vertices[0];
    527             v1 = vertices[1];
    528             v2 = vertices[2];
    529             v3 = vertices[3];            vertices += 4;
    530 
    531             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    532             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    533             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    534             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    535 
    536             lo0 = lo0*vLo;
    537             lo1 = lo1*vLo;
    538             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    539             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    540             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    541             z = z*vHi;
    542             x = x+y;
    543             x = x+z;
    544             stack_array[index+3] = x;
    545             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    546 
    547             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
    548         }
    549 
    550         // If we found a new min
    551         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
    552         {
    553             // copy the new min across all lanes of our min accumulator
    554             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
    555             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
    556 
    557             dotmin = min;
    558 
    559             // find first occurrence of that min
    560             size_t test;
            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // the new min is guaranteed to be in stack_array, so this loop terminates
    562             {}
    563             // record where it is.
    564             minIndex = 4*index + segment + indexTable[test];
    565         }
    566     }
    567 
    568     // account for work we've already done
    569     count -= segment;
    570 
    571     // Deal with the last < STACK_ARRAY_COUNT vectors
    572     min = dotmin;
    573     index = 0;
    574 
    575 
    576     if(btUnlikely( count > 16) )
    577     {
    578         for( ; index + 4 <= count / 4; index+=4 )
    579         { // do four dot products at a time. Carefully avoid touching the w element.
    580             float4 v0 = vertices[0];
    581             float4 v1 = vertices[1];
    582             float4 v2 = vertices[2];
    583             float4 v3 = vertices[3];            vertices += 4;
    584 
    585             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    586             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    587             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    588             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    589 
    590             lo0 = lo0*vLo;
    591             lo1 = lo1*vLo;
    592             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
    593             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    594             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    595             z = z*vHi;
    596             x = x+y;
    597             x = x+z;
    598             stack_array[index] = x;
    599             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    600 
    601             v0 = vertices[0];
    602             v1 = vertices[1];
    603             v2 = vertices[2];
    604             v3 = vertices[3];            vertices += 4;
    605 
    606             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    607             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    608             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    609             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    610 
    611             lo0 = lo0*vLo;
    612             lo1 = lo1*vLo;
    613             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    614             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    615             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    616             z = z*vHi;
    617             x = x+y;
    618             x = x+z;
    619             stack_array[index+1] = x;
    620             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    621 
    622             v0 = vertices[0];
    623             v1 = vertices[1];
    624             v2 = vertices[2];
    625             v3 = vertices[3];            vertices += 4;
    626 
    627             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    628             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    629             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    630             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    631 
    632             lo0 = lo0*vLo;
    633             lo1 = lo1*vLo;
    634             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    635             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    636             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    637             z = z*vHi;
    638             x = x+y;
    639             x = x+z;
    640             stack_array[index+2] = x;
    641             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    642 
    643             v0 = vertices[0];
    644             v1 = vertices[1];
    645             v2 = vertices[2];
    646             v3 = vertices[3];            vertices += 4;
    647 
    648             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    649             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    650             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    651             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    652 
    653             lo0 = lo0*vLo;
    654             lo1 = lo1*vLo;
    655             z = _mm_shuffle_ps(hi0, hi1, 0x88);
    656             x = _mm_shuffle_ps(lo0, lo1, 0x88);
    657             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    658             z = z*vHi;
    659             x = x+y;
    660             x = x+z;
    661             stack_array[index+3] = x;
    662             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    663 
    664             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
    665         }
    666     }
    667 
    668     size_t localCount = (count & -4L) - 4*index;
    669     if( localCount )
    670     {
    671 
    672 
    673 #ifdef __APPLE__
        vertices += localCount;      // pre-advance past this block; the asm below indexes it backwards via byteIndex
    675         float4 t0, t1, t2, t3, t4;
    676         size_t byteIndex = -(localCount) * sizeof(float);
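        // byteIndex counts up from a negative offset to zero, exactly as in the _maxdot_large
        // asm block above.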
    677         float4 * sap = &stack_array[index + localCount / 4];
    678 
    679         asm volatile
    680         (   ".align 4                                                                   \n\
    681              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
    682              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
    683              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
    684              movaps  %[t0], %[min]                               // vertices[0]      \n\
    685              movlhps %[t1], %[min]                               // x0y0x1y1         \n\
    686              movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
    687              movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
    688              mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
    689              movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
    690              movaps  %[t3], %[t0]                                // vertices[2]      \n\
    691              movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
    692              movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
    693              mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
    694              shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
    695              mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
    696              movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
    697              shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
    698              shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
    699              addps   %[t3], %[min]                               // x + y            \n\
    700              addps   %[t1], %[min]                               // x + y + z        \n\
    701              movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
    702              minps   %[t2], %[min]                               // record min, restore min   \n\
    703              add     $16, %[byteIndex]                           // advance loop counter\n\
    704              jnz     0b                                          \n\
    705              "
    706          : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
    707          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
    708          : "memory", "cc"
    709          );
    710         index += localCount/4;
    711 #else
    712         {
    713             for( unsigned int i=0; i<localCount/4; i++,index++)
    714             { // do four dot products at a time. Carefully avoid touching the w element.
    715                 float4 v0 = vertices[0];
    716                 float4 v1 = vertices[1];
    717                 float4 v2 = vertices[2];
    718                 float4 v3 = vertices[3];
    719                 vertices += 4;
    720 
    721                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
    722                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
    723                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
    724                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
    725 
    726                 lo0 = lo0*vLo;
    727                 lo1 = lo1*vLo;
    728                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
    729                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    730                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    731                 z = z*vHi;
    732                 x = x+y;
    733                 x = x+z;
    734                 stack_array[index] = x;
                min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    736             }
    737         }
    738 
    739 #endif
    740     }
    741 
    742     // process the last few points
    743     if( count & 3 )
    744     {
    745         float4 v0, v1, v2, x, y, z;
    746         switch( count & 3 )
    747         {
    748             case 3:
    749             {
    750                 v0 = vertices[0];
    751                 v1 = vertices[1];
    752                 v2 = vertices[2];
    753 
    754                 // Calculate 3 dot products, transpose, duplicate v2
    755                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
    756                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
    757                 lo0 = lo0*vLo;
    758                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
    759                 z = z*vHi;
    760                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
    761                 lo1 = lo1*vLo;
    762                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
    763                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
    764             }
    765                 break;
    766             case 2:
    767             {
    768                 v0 = vertices[0];
    769                 v1 = vertices[1];
    770                 float4 xy = _mm_movelh_ps(v0, v1);
    771                 z = _mm_movehl_ps(v1, v0);
    772                 xy = xy*vLo;
    773                 z = _mm_shuffle_ps( z, z,  0xa8);
    774                 x = _mm_shuffle_ps( xy, xy, 0xa8);
    775                 y = _mm_shuffle_ps( xy, xy, 0xfd);
    776                 z = z*vHi;
    777             }
    778                 break;
    779             case 1:
    780             {
    781                 float4 xy = vertices[0];
    782                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
    783                 xy = xy*vLo;
    784                 z = z*vHi;
    785                 x = _mm_shuffle_ps(xy, xy, 0);
    786                 y = _mm_shuffle_ps(xy, xy, 0x55);
    787             }
    788                 break;
    789         }
    790         x = x+y;
    791         x = x+z;
    792         stack_array[index] = x;
    793         min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
    794         index++;
    795     }
    796 
    797     // if we found a new min.
    798     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
    799     { // we found a new min. Search for it
    800       // find min across the min vector, place in all elements of min -- big latency hit here
    801         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
    802         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
    803 
    804         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
    805         // this where it actually makes a difference is handled in the early out at the top of the function,
    806         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
    807         // complexity, and removed it.
    808 
    809         dotmin = min;
    810 
        // scan for the first occurrence of min in the array
        size_t test;
        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // the min is guaranteed to be in stack_array, so this loop terminates
    814         {}
    815         minIndex = 4*index + segment + indexTable[test];
    816     }
    817 
    818     _mm_store_ss( dotResult, dotmin);
    819     return minIndex;
    820 }
    821 
    822 
    823 #elif defined BT_USE_NEON
    824 
    825 #define ARM_NEON_GCC_COMPATIBILITY  1
    826 #include <arm_neon.h>
    827 #include <sys/types.h>
    828 #include <sys/sysctl.h> //for sysctlbyname
    829 
    830 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
    831 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
    832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
    833 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
    834 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
    835 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
    836 
    837 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
    838 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
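// Self-selecting function pointers: the first call lands in the _sel variants, which probe the
// CPU once via sysctl, rebind the pointer to the _v0 or _v1 implementation, and forward the
// call, so later calls dispatch directly.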
    839 
    840 
    841 static inline uint32_t btGetCpuCapabilities( void )
    842 {
    843     static uint32_t capabilities = 0;
    844     static bool testedCapabilities = false;
    845 
    if( !testedCapabilities )
    847     {
    848         uint32_t hasFeature = 0;
    849         size_t featureSize = sizeof( hasFeature );
    850         int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
    851 
    852         if( 0 == err && hasFeature)
    853             capabilities |= 0x2000;
    854 
        testedCapabilities = true;
    856     }
    857 
    858     return capabilities;
    859 }
    860 
    861 
    862 
    863 
    864 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
    865 {
    866 
    867     if( btGetCpuCapabilities() & 0x2000 )
    868         _maxdot_large = _maxdot_large_v1;
    869     else
    870         _maxdot_large = _maxdot_large_v0;
    871 
    872     return _maxdot_large(vv, vec, count, dotResult);
    873 }
    874 
    875 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
    876 {
    877 
    878     if( btGetCpuCapabilities() & 0x2000 )
    879         _mindot_large = _mindot_large_v1;
    880     else
    881         _mindot_large = _mindot_large_v0;
    882 
    883     return _mindot_large(vv, vec, count, dotResult);
    884 }
    885 
    886 
    887 
    888 #if defined __arm__
    889 # define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
    890 #else
    891 //support 64bit arm
    892 # define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
    893 #endif
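// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t and advances the pointer
// past it, using a GCC statement expression: 32-bit ARM gets an inline vld1.f32 with the :128
// alignment hint and post-increment writeback, 64-bit ARM a plain aligned load plus a 16-byte
// pointer bump.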
    894 
    895 
    896 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
    897 {
    898     unsigned long i = 0;
    899     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    900     float32x2_t vLo = vget_low_f32(vvec);
    901     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    902     float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    903     float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    904     uint32x2_t indexLo = (uint32x2_t) {0, 1};
    905     uint32x2_t indexHi = (uint32x2_t) {2, 3};
    906     uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    907     uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    908     const uint32x2_t four = (uint32x2_t) {4,4};
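    // _v0 works on 64-bit d registers: rLo holds the dots of v0/v1 and rHi those of v2/v3, with
    // iLo/iHi tracking the vertex index of the current best in each lane via vbsl; the running
    // index counters advance by four per group, and the per-lane winners are reduced at the end.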
    909 
    910     for( ; i+8 <= count; i+= 8 )
    911     {
    912         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
    913         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
    914         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
    915         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
    916 
    917         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
    918         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
    919         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
    920         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
    921 
    922         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
    923         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
    924         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
    925         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
    926 
    927         float32x2_t rLo = vpadd_f32( xy0, xy1);
    928         float32x2_t rHi = vpadd_f32( xy2, xy3);
    929         rLo = vadd_f32(rLo, zLo);
    930         rHi = vadd_f32(rHi, zHi);
    931 
    932         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
    933         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
    934         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
    935         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
    936         iLo = vbsl_u32(maskLo, indexLo, iLo);
    937         iHi = vbsl_u32(maskHi, indexHi, iHi);
    938         indexLo = vadd_u32(indexLo, four);
    939         indexHi = vadd_u32(indexHi, four);
    940 
    941         v0 = vld1q_f32_aligned_postincrement( vv );
    942         v1 = vld1q_f32_aligned_postincrement( vv );
    943         v2 = vld1q_f32_aligned_postincrement( vv );
    944         v3 = vld1q_f32_aligned_postincrement( vv );
    945 
    946         xy0 = vmul_f32( vget_low_f32(v0), vLo);
    947         xy1 = vmul_f32( vget_low_f32(v1), vLo);
    948         xy2 = vmul_f32( vget_low_f32(v2), vLo);
    949         xy3 = vmul_f32( vget_low_f32(v3), vLo);
    950 
    951         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
    952         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
    953         zLo = vmul_f32( z0.val[0], vHi);
    954         zHi = vmul_f32( z1.val[0], vHi);
    955 
    956         rLo = vpadd_f32( xy0, xy1);
    957         rHi = vpadd_f32( xy2, xy3);
    958         rLo = vadd_f32(rLo, zLo);
    959         rHi = vadd_f32(rHi, zHi);
    960 
    961         maskLo = vcgt_f32( rLo, dotMaxLo );
    962         maskHi = vcgt_f32( rHi, dotMaxHi );
    963         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
    964         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
    965         iLo = vbsl_u32(maskLo, indexLo, iLo);
    966         iHi = vbsl_u32(maskHi, indexHi, iHi);
    967         indexLo = vadd_u32(indexLo, four);
    968         indexHi = vadd_u32(indexHi, four);
    969     }
    970 
    971     for( ; i+4 <= count; i+= 4 )
    972     {
    973         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
    974         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
    975         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
    976         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
    977 
    978         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
    979         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
    980         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
    981         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
    982 
    983         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
    984         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
    985         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
    986         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
    987 
    988         float32x2_t rLo = vpadd_f32( xy0, xy1);
    989         float32x2_t rHi = vpadd_f32( xy2, xy3);
    990         rLo = vadd_f32(rLo, zLo);
    991         rHi = vadd_f32(rHi, zHi);
    992 
    993         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
    994         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
    995         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
    996         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
    997         iLo = vbsl_u32(maskLo, indexLo, iLo);
    998         iHi = vbsl_u32(maskHi, indexHi, iHi);
    999         indexLo = vadd_u32(indexLo, four);
   1000         indexHi = vadd_u32(indexHi, four);
   1001     }
   1002 
   1003     switch( count & 3 )
   1004     {
   1005         case 3:
   1006         {
   1007             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1008             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1009             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1010 
   1011             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1012             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1013             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
   1014 
   1015             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1016             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
   1017             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
   1018 
   1019             float32x2_t rLo = vpadd_f32( xy0, xy1);
   1020             float32x2_t rHi = vpadd_f32( xy2, xy2);
   1021             rLo = vadd_f32(rLo, zLo);
   1022             rHi = vadd_f32(rHi, zHi);
   1023 
   1024             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
   1025             uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
   1026             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
   1027             dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
   1028             iLo = vbsl_u32(maskLo, indexLo, iLo);
   1029             iHi = vbsl_u32(maskHi, indexHi, iHi);
   1030         }
   1031             break;
   1032         case 2:
   1033         {
   1034             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1035             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1036 
   1037             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1038             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1039 
   1040             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1041             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
   1042 
   1043             float32x2_t rLo = vpadd_f32( xy0, xy1);
   1044             rLo = vadd_f32(rLo, zLo);
   1045 
   1046             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
   1047             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
   1048             iLo = vbsl_u32(maskLo, indexLo, iLo);
   1049         }
   1050             break;
   1051         case 1:
   1052         {
   1053             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1054             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1055             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
   1056             float32x2_t zLo = vmul_f32( z0, vHi);
   1057             float32x2_t rLo = vpadd_f32( xy0, xy0);
   1058             rLo = vadd_f32(rLo, zLo);
   1059             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
   1060             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
   1061             iLo = vbsl_u32(maskLo, indexLo, iLo);
   1062         }
   1063             break;
   1064 
   1065         default:
   1066             break;
   1067     }
   1068 
   1069     // select best answer between hi and lo results
   1070     uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
   1071     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
   1072     iLo = vbsl_u32(mask, iHi, iLo);
   1073 
   1074     // select best answer between even and odd results
   1075     dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
   1076     iHi = vdup_lane_u32(iLo, 1);
   1077     mask = vcgt_f32( dotMaxHi, dotMaxLo );
   1078     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
   1079     iLo = vbsl_u32(mask, iHi, iLo);
   1080 
   1081     *dotResult = vget_lane_f32( dotMaxLo, 0);
   1082     return vget_lane_u32(iLo, 0);
   1083 }
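// _maxdot_large_v1 performs the same computation on full 128-bit q registers, producing four dot
// products per step and tracking candidate indices in a single uint32x4_t. It is selected at run
// time when sysctl reports "hw.optional.neon_hpfp", presumably because those cores handle the
// wider NEON operations more efficiently.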
   1084 
   1085 
   1086 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
   1087 {
   1088     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
   1089     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
   1090     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
   1091     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
   1092     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
   1093     uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
   1094     float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
   1095 
   1096     unsigned long i = 0;
   1097     for( ; i + 8 <= count; i += 8 )
   1098     {
   1099         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1100         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1101         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1102         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
   1103 
   1104         // the next two lines should resolve to a single vswp d, d
   1105         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1106         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
   1107         // the next two lines should resolve to a single vswp d, d
   1108         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1109         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
   1110 
   1111         xy0 = vmulq_f32(xy0, vLo);
   1112         xy1 = vmulq_f32(xy1, vLo);
   1113 
   1114         float32x4x2_t zb = vuzpq_f32( z0, z1);
   1115         float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1116         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
   1117         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1118         x = vaddq_f32(x, z);
   1119 
   1120         uint32x4_t mask = vcgtq_f32(x, maxDot);
   1121         maxDot = vbslq_f32( mask, x, maxDot);
   1122         index = vbslq_u32(mask, local_index, index);
   1123         local_index = vaddq_u32(local_index, four);
   1124 
   1125         v0 = vld1q_f32_aligned_postincrement( vv );
   1126         v1 = vld1q_f32_aligned_postincrement( vv );
   1127         v2 = vld1q_f32_aligned_postincrement( vv );
   1128         v3 = vld1q_f32_aligned_postincrement( vv );
   1129 
   1130         // the next two lines should resolve to a single vswp d, d
   1131         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1132         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
   1133         // the next two lines should resolve to a single vswp d, d
   1134         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1135         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
   1136 
   1137         xy0 = vmulq_f32(xy0, vLo);
   1138         xy1 = vmulq_f32(xy1, vLo);
   1139 
   1140         zb = vuzpq_f32( z0, z1);
   1141         z = vmulq_f32( zb.val[0], vHi);
   1142         xy = vuzpq_f32( xy0, xy1);
   1143         x = vaddq_f32(xy.val[0], xy.val[1]);
   1144         x = vaddq_f32(x, z);
   1145 
   1146         mask = vcgtq_f32(x, maxDot);
   1147         maxDot = vbslq_f32( mask, x, maxDot);
   1148         index = vbslq_u32(mask, local_index, index);
   1149         local_index = vaddq_u32(local_index, four);
   1150     }
   1151 
   1152     for( ; i + 4 <= count; i += 4 )
   1153     {
   1154         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1155         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1156         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1157         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
   1158 
   1159         // the next two lines should resolve to a single vswp d, d
   1160         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1161         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
   1162         // the next two lines should resolve to a single vswp d, d
   1163         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1164         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
   1165 
   1166         xy0 = vmulq_f32(xy0, vLo);
   1167         xy1 = vmulq_f32(xy1, vLo);
   1168 
   1169         float32x4x2_t zb = vuzpq_f32( z0, z1);
   1170         float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1171         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
   1172         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1173         x = vaddq_f32(x, z);
   1174 
   1175         uint32x4_t mask = vcgtq_f32(x, maxDot);
   1176         maxDot = vbslq_f32( mask, x, maxDot);
   1177         index = vbslq_u32(mask, local_index, index);
   1178         local_index = vaddq_u32(local_index, four);
   1179     }
   1180 
   1181     switch (count & 3) {
   1182         case 3:
   1183         {
   1184             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1185             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1186             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1187 
   1188             // the next two lines should resolve to a single vswp d, d
   1189             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1190             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
   1191             // the next two lines should resolve to a single vswp d, d
   1192             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1193             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
   1194 
   1195             xy0 = vmulq_f32(xy0, vLo);
   1196             xy1 = vmulq_f32(xy1, vLo);
   1197 
   1198             float32x4x2_t zb = vuzpq_f32( z0, z1);
   1199             float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1200             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
   1201             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1202             x = vaddq_f32(x, z);
   1203 
   1204             uint32x4_t mask = vcgtq_f32(x, maxDot);
   1205             maxDot = vbslq_f32( mask, x, maxDot);
   1206             index = vbslq_u32(mask, local_index, index);
   1207             local_index = vaddq_u32(local_index, four);
   1208         }
   1209             break;
   1210 
   1211         case 2:
   1212         {
   1213             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1214             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1215 
    1216             // the next two vcombines should resolve to a single vswp d, d
    1217             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
    1218             // (z0 below comes from the same vswp: the zw halves of v0 and v1)
    1219             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1220 
   1221             xy0 = vmulq_f32(xy0, vLo);
   1222 
   1223             float32x4x2_t zb = vuzpq_f32( z0, z0);
   1224             float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1225             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
   1226             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1227             x = vaddq_f32(x, z);
   1228 
   1229             uint32x4_t mask = vcgtq_f32(x, maxDot);
   1230             maxDot = vbslq_f32( mask, x, maxDot);
   1231             index = vbslq_u32(mask, local_index, index);
   1232             local_index = vaddq_u32(local_index, four);
   1233         }
   1234             break;
   1235 
   1236         case 1:
   1237         {
   1238             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1239 
    1240             // broadcast the xy pair of v0 into both halves of the register
    1241             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
    1242             // broadcast the z component of v0 across all four lanes
    1243             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
   1244 
   1245             xy0 = vmulq_f32(xy0, vLo);
   1246 
   1247             z = vmulq_f32( z, vHi);
   1248             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
   1249             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1250             x = vaddq_f32(x, z);
   1251 
   1252             uint32x4_t mask = vcgtq_f32(x, maxDot);
   1253             maxDot = vbslq_f32( mask, x, maxDot);
   1254             index = vbslq_u32(mask, local_index, index);
   1255             local_index = vaddq_u32(local_index, four);
   1256         }
   1257             break;
   1258 
   1259         default:
   1260             break;
   1261     }
   1262 
   1263 
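             // maxDot now holds four independent per-lane maxima and index the matching vertex
             // indices; reduce them to a single winner in two steps below.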
   1264     // select best answer between hi and lo results
   1265     uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
   1266     float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
   1267     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
   1268 
   1269     // select best answer between even and odd results
   1270     float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
   1271     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
   1272     mask = vcgt_f32( maxDotO, maxDot2 );
   1273     maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
   1274     index2 = vbsl_u32(mask, indexHi, index2);
   1275 
   1276     *dotResult = vget_lane_f32( maxDot2, 0);
   1277     return vget_lane_u32(index2, 0);
   1278 
   1279 }
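
         // For reference, a minimal scalar sketch of what the NEON max-dot kernel above computes,
         // assuming the same input layout: each vertex stored as four floats (x, y, z, pad) with
         // the pad/w component ignored. The helper name is illustrative and the block is kept
         // under #if 0 purely as documentation; it is not part of the build.
         #if 0
         static long _maxdot_large_scalar_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
         {
             long maxIndex = -1;
             float maxDot = -BT_INFINITY;
             for( unsigned long i = 0; i < count; i++ )
             {
                 const float *v = vv + 4 * i;
                 float dot = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];
                 if( dot > maxDot )
                 {
                     maxDot = dot;
                     maxIndex = (long) i;
                 }
             }
             *dotResult = maxDot;
             return maxIndex;
         }
         #endif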
   1280 
   1281 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
   1282 {
   1283     unsigned long i = 0;
   1284     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
   1285     float32x2_t vLo = vget_low_f32(vvec);
   1286     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
   1287     float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
   1288     float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
   1289     uint32x2_t indexLo = (uint32x2_t) {0, 1};
   1290     uint32x2_t indexHi = (uint32x2_t) {2, 3};
   1291     uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
   1292     uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
   1293     const uint32x2_t four = (uint32x2_t) {4,4};
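             // Setup: vLo = (vec.x, vec.y) and vHi = (vec.z, vec.z). Two independent 2-lane minima
             // are tracked: the "lo" pair covers vertices i+0/i+1 of each group of four and the
             // "hi" pair covers i+2/i+3, with iLo/iHi holding the winning indices per lane.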
   1294 
   1295     for( ; i+8 <= count; i+= 8 )
   1296     {
   1297         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1298         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1299         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1300         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
   1301 
   1302         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1303         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1304         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
   1305         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
   1306 
   1307         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1308         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
   1309         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
   1310         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
   1311 
   1312         float32x2_t rLo = vpadd_f32( xy0, xy1);
   1313         float32x2_t rHi = vpadd_f32( xy2, xy3);
   1314         rLo = vadd_f32(rLo, zLo);
   1315         rHi = vadd_f32(rHi, zHi);
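                 // vpadd sums each x*vec.x / y*vec.y pair horizontally, so with the z*vec.z terms
                 // from the vtrn added in, rLo = (dot(v0), dot(v1)) and rHi = (dot(v2), dot(v3)).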
   1316 
   1317         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
   1318         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
   1319         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
   1320         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
   1321         iLo = vbsl_u32(maskLo, indexLo, iLo);
   1322         iHi = vbsl_u32(maskHi, indexHi, iHi);
   1323         indexLo = vadd_u32(indexLo, four);
   1324         indexHi = vadd_u32(indexHi, four);
   1325 
   1326         v0 = vld1q_f32_aligned_postincrement( vv );
   1327         v1 = vld1q_f32_aligned_postincrement( vv );
   1328         v2 = vld1q_f32_aligned_postincrement( vv );
   1329         v3 = vld1q_f32_aligned_postincrement( vv );
   1330 
   1331         xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1332         xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1333         xy2 = vmul_f32( vget_low_f32(v2), vLo);
   1334         xy3 = vmul_f32( vget_low_f32(v3), vLo);
   1335 
   1336         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1337         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
   1338         zLo = vmul_f32( z0.val[0], vHi);
   1339         zHi = vmul_f32( z1.val[0], vHi);
   1340 
   1341         rLo = vpadd_f32( xy0, xy1);
   1342         rHi = vpadd_f32( xy2, xy3);
   1343         rLo = vadd_f32(rLo, zLo);
   1344         rHi = vadd_f32(rHi, zHi);
   1345 
   1346         maskLo = vclt_f32( rLo, dotMinLo );
   1347         maskHi = vclt_f32( rHi, dotMinHi );
   1348         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
   1349         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
   1350         iLo = vbsl_u32(maskLo, indexLo, iLo);
   1351         iHi = vbsl_u32(maskHi, indexHi, iHi);
   1352         indexLo = vadd_u32(indexLo, four);
   1353         indexHi = vadd_u32(indexHi, four);
   1354     }
   1355 
   1356     for( ; i+4 <= count; i+= 4 )
   1357     {
   1358         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1359         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1360         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1361         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
   1362 
   1363         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1364         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1365         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
   1366         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
   1367 
   1368         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1369         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
   1370         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
   1371         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
   1372 
   1373         float32x2_t rLo = vpadd_f32( xy0, xy1);
   1374         float32x2_t rHi = vpadd_f32( xy2, xy3);
   1375         rLo = vadd_f32(rLo, zLo);
   1376         rHi = vadd_f32(rHi, zHi);
   1377 
   1378         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
   1379         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
   1380         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
   1381         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
   1382         iLo = vbsl_u32(maskLo, indexLo, iLo);
   1383         iHi = vbsl_u32(maskHi, indexHi, iHi);
   1384         indexLo = vadd_u32(indexLo, four);
   1385         indexHi = vadd_u32(indexHi, four);
   1386     }
   1387     switch( count & 3 )
   1388     {
   1389         case 3:
   1390         {
   1391             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1392             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1393             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1394 
   1395             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1396             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1397             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
   1398 
   1399             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1400             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
   1401             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
   1402 
   1403             float32x2_t rLo = vpadd_f32( xy0, xy1);
   1404             float32x2_t rHi = vpadd_f32( xy2, xy2);
   1405             rLo = vadd_f32(rLo, zLo);
   1406             rHi = vadd_f32(rHi, zHi);
   1407 
   1408             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
   1409             uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
   1410             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
   1411             dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
   1412             iLo = vbsl_u32(maskLo, indexLo, iLo);
   1413             iHi = vbsl_u32(maskHi, indexHi, iHi);
   1414         }
   1415             break;
   1416         case 2:
   1417         {
   1418             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1419             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1420 
   1421             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1422             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
   1423 
   1424             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
   1425             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
   1426 
   1427             float32x2_t rLo = vpadd_f32( xy0, xy1);
   1428             rLo = vadd_f32(rLo, zLo);
   1429 
   1430             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
   1431             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
   1432             iLo = vbsl_u32(maskLo, indexLo, iLo);
   1433         }
   1434             break;
   1435         case 1:
   1436         {
   1437             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1438             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
   1439             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
   1440             float32x2_t zLo = vmul_f32( z0, vHi);
   1441             float32x2_t rLo = vpadd_f32( xy0, xy0);
   1442             rLo = vadd_f32(rLo, zLo);
   1443             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
   1444             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
   1445             iLo = vbsl_u32(maskLo, indexLo, iLo);
   1446         }
   1447             break;
   1448 
   1449         default:
   1450             break;
   1451     }
   1452 
   1453     // select best answer between hi and lo results
   1454     uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
   1455     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
   1456     iLo = vbsl_u32(mask, iHi, iLo);
   1457 
   1458     // select best answer between even and odd results
   1459     dotMinHi = vdup_lane_f32(dotMinLo, 1);
   1460     iHi = vdup_lane_u32(iLo, 1);
   1461     mask = vclt_f32( dotMinHi, dotMinLo );
   1462     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
   1463     iLo = vbsl_u32(mask, iHi, iLo);
   1464 
   1465     *dotResult = vget_lane_f32( dotMinLo, 0);
   1466     return vget_lane_u32(iLo, 0);
   1467 }
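
         // Note: _mindot_large_v0 above works mostly on 64-bit d registers, using vpadd to form
         // two dot products at a time and tracking two separate 2-lane minima, while
         // _mindot_large_v1 below mirrors the q-register layout of the max-dot kernel and
         // evaluates four dot products per comparison. Which variant is faster depends on the
         // target core, which is presumably why both are kept.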
   1468 
   1469 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
   1470 {
   1471     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
   1472     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
   1473     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
   1474     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
   1475     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
   1476     uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
   1477     float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
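             // Setup mirrors the max-dot kernel: vLo = (vec.x, vec.y, vec.x, vec.y), vHi broadcasts
             // vec.z, and four per-lane minima are tracked in minDot with their vertex indices in
             // index; local_index counts i+0..i+3 for the current group of four vertices.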
   1478 
   1479     unsigned long i = 0;
   1480     for( ; i + 8 <= count; i += 8 )
   1481     {
   1482         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1483         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1484         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1485         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
   1486 
   1487         // the next two lines should resolve to a single vswp d, d
   1488         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1489         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
   1490         // the next two lines should resolve to a single vswp d, d
   1491         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1492         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
   1493 
   1494         xy0 = vmulq_f32(xy0, vLo);
   1495         xy1 = vmulq_f32(xy1, vLo);
   1496 
   1497         float32x4x2_t zb = vuzpq_f32( z0, z1);
   1498         float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1499         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
   1500         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1501         x = vaddq_f32(x, z);
   1502 
   1503         uint32x4_t mask = vcltq_f32(x, minDot);
   1504         minDot = vbslq_f32( mask, x, minDot);
   1505         index = vbslq_u32(mask, local_index, index);
   1506         local_index = vaddq_u32(local_index, four);
   1507 
   1508         v0 = vld1q_f32_aligned_postincrement( vv );
   1509         v1 = vld1q_f32_aligned_postincrement( vv );
   1510         v2 = vld1q_f32_aligned_postincrement( vv );
   1511         v3 = vld1q_f32_aligned_postincrement( vv );
   1512 
   1513         // the next two lines should resolve to a single vswp d, d
   1514         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1515         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
   1516         // the next two lines should resolve to a single vswp d, d
   1517         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1518         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
   1519 
   1520         xy0 = vmulq_f32(xy0, vLo);
   1521         xy1 = vmulq_f32(xy1, vLo);
   1522 
   1523         zb = vuzpq_f32( z0, z1);
   1524         z = vmulq_f32( zb.val[0], vHi);
   1525         xy = vuzpq_f32( xy0, xy1);
   1526         x = vaddq_f32(xy.val[0], xy.val[1]);
   1527         x = vaddq_f32(x, z);
   1528 
   1529         mask = vcltq_f32(x, minDot);
   1530         minDot = vbslq_f32( mask, x, minDot);
   1531         index = vbslq_u32(mask, local_index, index);
   1532         local_index = vaddq_u32(local_index, four);
   1533     }
   1534 
   1535     for( ; i + 4 <= count; i += 4 )
   1536     {
   1537         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1538         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1539         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1540         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
   1541 
   1542         // the next two lines should resolve to a single vswp d, d
   1543         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1544         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
   1545         // the next two lines should resolve to a single vswp d, d
   1546         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1547         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
   1548 
   1549         xy0 = vmulq_f32(xy0, vLo);
   1550         xy1 = vmulq_f32(xy1, vLo);
   1551 
   1552         float32x4x2_t zb = vuzpq_f32( z0, z1);
   1553         float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1554         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
   1555         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1556         x = vaddq_f32(x, z);
   1557 
   1558         uint32x4_t mask = vcltq_f32(x, minDot);
   1559         minDot = vbslq_f32( mask, x, minDot);
   1560         index = vbslq_u32(mask, local_index, index);
   1561         local_index = vaddq_u32(local_index, four);
   1562     }
   1563 
   1564     switch (count & 3) {
   1565         case 3:
   1566         {
   1567             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1568             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1569             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
   1570 
   1571             // the next two lines should resolve to a single vswp d, d
   1572             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
   1573             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
   1574             // the next two lines should resolve to a single vswp d, d
   1575             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1576             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
   1577 
   1578             xy0 = vmulq_f32(xy0, vLo);
   1579             xy1 = vmulq_f32(xy1, vLo);
   1580 
   1581             float32x4x2_t zb = vuzpq_f32( z0, z1);
   1582             float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1583             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
   1584             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1585             x = vaddq_f32(x, z);
   1586 
   1587             uint32x4_t mask = vcltq_f32(x, minDot);
   1588             minDot = vbslq_f32( mask, x, minDot);
   1589             index = vbslq_u32(mask, local_index, index);
   1590             local_index = vaddq_u32(local_index, four);
   1591         }
   1592             break;
   1593 
   1594         case 2:
   1595         {
   1596             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1597             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
   1598 
    1599             // the next two vcombines should resolve to a single vswp d, d
    1600             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
    1601             // (z0 below comes from the same vswp: the zw halves of v0 and v1)
    1602             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
   1603 
   1604             xy0 = vmulq_f32(xy0, vLo);
   1605 
   1606             float32x4x2_t zb = vuzpq_f32( z0, z0);
   1607             float32x4_t z = vmulq_f32( zb.val[0], vHi);
   1608             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
   1609             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1610             x = vaddq_f32(x, z);
   1611 
   1612             uint32x4_t mask = vcltq_f32(x, minDot);
   1613             minDot = vbslq_f32( mask, x, minDot);
   1614             index = vbslq_u32(mask, local_index, index);
   1615             local_index = vaddq_u32(local_index, four);
   1616         }
   1617             break;
   1618 
   1619         case 1:
   1620         {
   1621             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
   1622 
    1623             // broadcast the xy pair of v0 into both halves of the register
    1624             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
    1625             // broadcast the z component of v0 across all four lanes
    1626             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
   1627 
   1628             xy0 = vmulq_f32(xy0, vLo);
   1629 
   1630             z = vmulq_f32( z, vHi);
   1631             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
   1632             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
   1633             x = vaddq_f32(x, z);
   1634 
   1635             uint32x4_t mask = vcltq_f32(x, minDot);
   1636             minDot = vbslq_f32( mask, x, minDot);
   1637             index = vbslq_u32(mask, local_index, index);
   1638             local_index = vaddq_u32(local_index, four);
   1639         }
   1640             break;
   1641 
   1642         default:
   1643             break;
   1644     }
   1645 
   1646 
   1647     // select best answer between hi and lo results
   1648     uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
   1649     float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
   1650     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
   1651 
   1652     // select best answer between even and odd results
   1653     float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
   1654     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
   1655     mask = vclt_f32( minDotO, minDot2 );
   1656     minDot2 = vbsl_f32(mask, minDotO, minDot2);
   1657     index2 = vbsl_u32(mask, indexHi, index2);
   1658 
   1659     *dotResult = vget_lane_f32( minDot2, 0);
   1660     return vget_lane_u32(index2, 0);
   1661 
   1662 }
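
         // Illustrative only: a hedged sketch of how one of these kernels might be driven,
         // assuming the caller packs each point as four floats (x, y, z, pad) in 16-byte-aligned
         // storage, as the vld1q loads above expect. The function and parameter names below are
         // made up for the example; only _mindot_large_v1 comes from this file.
         #if 0
         static float example_min_support( const float *points16ByteAligned, unsigned long numPoints,
                                           const float direction[4], long *outIndex )
         {
             float bestDot = 0.0f;
             // Returns the index of the point with the smallest dot product against 'direction'
             // and writes that dot product through the last argument.
             *outIndex = _mindot_large_v1( points16ByteAligned, direction, numPoints, &bestDot );
             return bestDot;
         }
         #endif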
   1663 
   1664 #else
   1665     #error Unhandled __APPLE__ arch
   1666 #endif
   1667 
   1668 #endif  /* __APPLE__ */
   1669 
   1670 
   1671