/*
   Copyright (c) 2011 Apple Inc.
   http://continuousphysics.com/Bullet/

   This software is provided 'as-is', without any express or implied warranty.
   In no event will the authors be held liable for any damages arising from the use of this software.
   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it freely,
   subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   This source version has been altered.
*/

#if defined (_WIN32) || defined (__i386__)
#define BT_USE_SSE_IN_API
#endif

#include "btVector3.h"

#if defined BT_USE_SIMD_VECTOR3

#if DEBUG
#include <string.h> // for memset
#endif

#ifdef __APPLE__
#include <stdint.h>
typedef float float4 __attribute__ ((vector_size(16)));
#else
#define float4 __m128
#endif
//typedef uint32_t uint4 __attribute__ ((vector_size(16)));

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>

long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	const float4 *vertices = (const float4*) vv;
	static const unsigned char indexTable[16] = { (unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
	float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
	float4 vvec = _mm_loadu_ps( vec );
	float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
	float4 vLo = _mm_movelh_ps( vvec, vvec );                                    /// xyxy

	long maxIndex = -1L;

	size_t segment = 0;
	float4 stack_array[ STACK_ARRAY_COUNT ];

#if DEBUG
	//memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
#endif

	size_t index;
	float4 max;
	// Faster loop without cleanup code for full tiles
	for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
	{
		max = dotMax;

		for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
		{ // do four dot products at a time. Carefully avoid touching the w element.
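			// Data layout: each float4 vertex holds x,y,z,w. The xy halves of four vertices are
			// multiplied by vLo = (vx,vy,vx,vy) first; _mm_shuffle_ps with 0x88/0xdd then gathers
			// the x-products and y-products, while 0x88 on the high halves gathers z0z1z2z3.
			// Adding x + y + z*vHi gives four dot products per step; the w lanes never contribute.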
			float4 v0 = vertices[0];
			float4 v1 = vertices[1];
			float4 v2 = vertices[2];
			float4 v3 = vertices[3];  vertices += 4;

			float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+1] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+2] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+3] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			// It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
		}

		// If we found a new max
		if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
		{
			// copy the new max across all lanes of our max accumulator
			max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
			max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

			dotMax = max;

			// find first occurrence of that max
			size_t test;
			for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
			{}
			// record where it is.
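			// 'test' is the 4-bit compare mask from _mm_movemask_ps; indexTable maps it to its
			// lowest set bit, i.e. the first lane in that group of four equal to the new maximum.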
			maxIndex = 4*index + segment + indexTable[test];
		}
	}

	// account for work we've already done
	count -= segment;

	// Deal with the last < STACK_ARRAY_COUNT vectors
	max = dotMax;
	index = 0;

	if( btUnlikely( count > 16) )
	{
		for( ; index + 4 <= count / 4; index += 4 )
		{ // do four dot products at a time. Carefully avoid touching the w element.
			float4 v0 = vertices[0];
			float4 v1 = vertices[1];
			float4 v2 = vertices[2];
			float4 v3 = vertices[3];  vertices += 4;

			float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+1] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+2] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+3] = x;
			max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

			// It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
		}
	}

	size_t localCount = (count & -4L) - 4*index;
	if( localCount )
	{
#ifdef __APPLE__
		float4 t0, t1, t2, t3, t4;
		float4 * sap = &stack_array[index + localCount / 4];
		vertices += localCount;      // counter the offset
		size_t byteIndex = -(localCount) * sizeof(float);
		//AT&T Code style assembly
		asm volatile
		(	".align 4                                       \n\
			0: movaps %[max], %[t2]                         // move max out of the way to avoid propagating NaNs in max \n\
			movaps (%[vertices], %[byteIndex], 4), %[t0]    // vertices[0]      \n\
			movaps 16(%[vertices], %[byteIndex], 4), %[t1]  // vertices[1]      \n\
			movaps %[t0], %[max]                            // vertices[0]      \n\
			movlhps %[t1], %[max]                           // x0y0x1y1         \n\
			movaps 32(%[vertices], %[byteIndex], 4), %[t3]  // vertices[2]      \n\
			movaps 48(%[vertices], %[byteIndex], 4), %[t4]  // vertices[3]      \n\
			mulps %[vLo], %[max]                            // x0y0x1y1 * vLo   \n\
			movhlps %[t0], %[t1]                            // z0w0z1w1         \n\
			movaps %[t3], %[t0]                             // vertices[2]      \n\
			movlhps %[t4], %[t0]                            // x2y2x3y3         \n\
			mulps %[vLo], %[t0]                             // x2y2x3y3 * vLo   \n\
			movhlps %[t3], %[t4]                            // z2w2z3w3         \n\
			shufps $0x88, %[t4], %[t1]                      // z0z1z2z3         \n\
			mulps %[vHi], %[t1]                             // z0z1z2z3 * vHi   \n\
			movaps %[max], %[t3]                            // x0y0x1y1 * vLo   \n\
			shufps $0x88, %[t0], %[max]                     // x0x1x2x3 * vLo.x \n\
			shufps $0xdd, %[t0], %[t3]                      // y0y1y2y3 * vLo.y \n\
			addps %[t3], %[max]                             // x + y            \n\
			addps %[t1], %[max]                             // x + y + z        \n\
			movaps %[max], (%[sap], %[byteIndex])           // record result for later scrutiny \n\
			maxps %[t2], %[max]                             // record max, restore max \n\
			add $16, %[byteIndex]                           // advance loop counter \n\
			jnz 0b                                          \n\
		"
		: [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
		: [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
		: "memory", "cc"
		);
		index += localCount/4;
#else
		{
			for( unsigned int i=0; i<localCount/4; i++,index++)
			{ // do four dot products at a time. Carefully avoid touching the w element.
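				// Intrinsics fallback used on non-Apple builds: the same four-at-a-time dot
				// product as the inline assembly block above, expressed with _mm_* calls.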
				float4 v0 = vertices[0];
				float4 v1 = vertices[1];
				float4 v2 = vertices[2];
				float4 v3 = vertices[3];
				vertices += 4;

				float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
				float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
				float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
				float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

				lo0 = lo0*vLo;
				lo1 = lo1*vLo;
				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
				z = z*vHi;
				x = x+y;
				x = x+z;
				stack_array[index] = x;
				max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan
			}
		}
#endif //__APPLE__
	}

	// process the last few points
	if( count & 3 )
	{
		float4 v0, v1, v2, x, y, z;
		switch( count & 3 )
		{
			case 3:
			{
				v0 = vertices[0];
				v1 = vertices[1];
				v2 = vertices[2];

				// Calculate 3 dot products, transpose, duplicate v2
				float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
				float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
				lo0 = lo0*vLo;
				z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
				z = z*vHi;
				float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
				lo1 = lo1*vLo;
				x = _mm_shuffle_ps(lo0, lo1, 0x88);
				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			}
			break;
			case 2:
			{
				v0 = vertices[0];
				v1 = vertices[1];
				float4 xy = _mm_movelh_ps(v0, v1);
				z = _mm_movehl_ps(v1, v0);
				xy = xy*vLo;
				z = _mm_shuffle_ps( z, z, 0xa8);
				x = _mm_shuffle_ps( xy, xy, 0xa8);
				y = _mm_shuffle_ps( xy, xy, 0xfd);
				z = z*vHi;
			}
			break;
			case 1:
			{
				float4 xy = vertices[0];
				z = _mm_shuffle_ps( xy, xy, 0xaa);
				xy = xy*vLo;
				z = z*vHi;
				x = _mm_shuffle_ps(xy, xy, 0);
				y = _mm_shuffle_ps(xy, xy, 0x55);
			}
			break;
		}
		x = x+y;
		x = x+z;
		stack_array[index] = x;
		max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan
		index++;
	}

	// if we found a new max.
	if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
	{ // we found a new max. Search for it
		// find max across the max vector, place in all elements of max -- big latency hit here
		max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
		max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
		// this where it actually makes a difference is handled in the early out at the top of the function,
		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
		// complexity, and removed it.
		dotMax = max;

		// scan for the first occurrence of max in the array
		size_t test;
		for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
		{}
		maxIndex = 4*index + segment + indexTable[test];
	}

	_mm_store_ss( dotResult, dotMax);
	return maxIndex;
}

long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );

long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	const float4 *vertices = (const float4*) vv;
	static const unsigned char indexTable[16] = { (unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
	float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
	float4 vvec = _mm_loadu_ps( vec );
	float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
	float4 vLo = _mm_movelh_ps( vvec, vvec );                                    /// xyxy

	long minIndex = -1L;

	size_t segment = 0;
	float4 stack_array[ STACK_ARRAY_COUNT ];

#if DEBUG
	//memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
#endif

	size_t index;
	float4 min;
	// Faster loop without cleanup code for full tiles
	for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
	{
		min = dotmin;

		for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
		{ // do four dot products at a time. Carefully avoid touching the w element.
			float4 v0 = vertices[0];
			float4 v1 = vertices[1];
			float4 v2 = vertices[2];
			float4 v3 = vertices[3];  vertices += 4;

			float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+1] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+2] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+3] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			// It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
		}

		// If we found a new min
		if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
		{
			// copy the new min across all lanes of our min accumulator
			min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
			min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

			dotmin = min;

			// find first occurrence of that min
			size_t test;
			for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
			{}
			// record where it is.
			minIndex = 4*index + segment + indexTable[test];
		}
	}

	// account for work we've already done
	count -= segment;

	// Deal with the last < STACK_ARRAY_COUNT vectors
	min = dotmin;
	index = 0;

	if( btUnlikely( count > 16) )
	{
		for( ; index + 4 <= count / 4; index += 4 )
		{ // do four dot products at a time. Carefully avoid touching the w element.
			float4 v0 = vertices[0];
			float4 v1 = vertices[1];
			float4 v2 = vertices[2];
			float4 v3 = vertices[3];  vertices += 4;

			float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+1] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+2] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			v0 = vertices[0];
			v1 = vertices[1];
			v2 = vertices[2];
			v3 = vertices[3];  vertices += 4;

			lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
			hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
			lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
			hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

			lo0 = lo0*vLo;
			lo1 = lo1*vLo;
			z = _mm_shuffle_ps(hi0, hi1, 0x88);
			x = _mm_shuffle_ps(lo0, lo1, 0x88);
			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			z = z*vHi;
			x = x+y;
			x = x+z;
			stack_array[index+3] = x;
			min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

			// It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
		}
	}

	size_t localCount = (count & -4L) - 4*index;
	if( localCount )
	{
#ifdef __APPLE__
		vertices += localCount;      // counter the offset
		float4 t0, t1, t2, t3, t4;
		size_t byteIndex = -(localCount) * sizeof(float);
		float4 * sap = &stack_array[index + localCount / 4];

		asm volatile
		(	".align 4                                       \n\
			0: movaps %[min], %[t2]                         // move min out of the way to avoid propagating NaNs in min \n\
			movaps (%[vertices], %[byteIndex], 4), %[t0]    // vertices[0]      \n\
			movaps 16(%[vertices], %[byteIndex], 4), %[t1]  // vertices[1]      \n\
			movaps %[t0], %[min]                            // vertices[0]      \n\
			movlhps %[t1], %[min]                           // x0y0x1y1         \n\
			movaps 32(%[vertices], %[byteIndex], 4), %[t3]  // vertices[2]      \n\
			movaps 48(%[vertices], %[byteIndex], 4), %[t4]  // vertices[3]      \n\
			mulps %[vLo], %[min]                            // x0y0x1y1 * vLo   \n\
			movhlps %[t0], %[t1]                            // z0w0z1w1         \n\
			movaps %[t3], %[t0]                             // vertices[2]      \n\
			movlhps %[t4], %[t0]                            // x2y2x3y3         \n\
			movhlps %[t3], %[t4]                            // z2w2z3w3         \n\
			mulps %[vLo], %[t0]                             // x2y2x3y3 * vLo   \n\
			shufps $0x88, %[t4], %[t1]                      // z0z1z2z3         \n\
			mulps %[vHi], %[t1]                             // z0z1z2z3 * vHi   \n\
			movaps %[min], %[t3]                            // x0y0x1y1 * vLo   \n\
			shufps $0x88, %[t0], %[min]                     // x0x1x2x3 * vLo.x \n\
			shufps $0xdd, %[t0], %[t3]                      // y0y1y2y3 * vLo.y \n\
			addps %[t3], %[min]                             // x + y            \n\
			addps %[t1], %[min]                             // x + y + z        \n\
			movaps %[min], (%[sap], %[byteIndex])           // record result for later scrutiny \n\
			minps %[t2], %[min]                             // record min, restore min \n\
			add $16, %[byteIndex]                           // advance loop counter \n\
			jnz 0b                                          \n\
		"
		: [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
		: [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
		: "memory", "cc"
		);
		index += localCount/4;
#else
		{
			for( unsigned int i=0; i<localCount/4; i++,index++)
			{ // do four dot products at a time. Carefully avoid touching the w element.
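				// Same intrinsics fallback as in _maxdot_large, accumulating the running
				// minimum instead of the maximum.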
				float4 v0 = vertices[0];
				float4 v1 = vertices[1];
				float4 v2 = vertices[2];
				float4 v3 = vertices[3];
				vertices += 4;

				float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
				float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
				float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
				float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

				lo0 = lo0*vLo;
				lo1 = lo1*vLo;
				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
				z = z*vHi;
				x = x+y;
				x = x+z;
				stack_array[index] = x;
				min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan
			}
		}

#endif
	}

	// process the last few points
	if( count & 3 )
	{
		float4 v0, v1, v2, x, y, z;
		switch( count & 3 )
		{
			case 3:
			{
				v0 = vertices[0];
				v1 = vertices[1];
				v2 = vertices[2];

				// Calculate 3 dot products, transpose, duplicate v2
				float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
				float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
				lo0 = lo0*vLo;
				z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
				z = z*vHi;
				float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
				lo1 = lo1*vLo;
				x = _mm_shuffle_ps(lo0, lo1, 0x88);
				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
			}
			break;
			case 2:
			{
				v0 = vertices[0];
				v1 = vertices[1];
				float4 xy = _mm_movelh_ps(v0, v1);
				z = _mm_movehl_ps(v1, v0);
				xy = xy*vLo;
				z = _mm_shuffle_ps( z, z, 0xa8);
				x = _mm_shuffle_ps( xy, xy, 0xa8);
				y = _mm_shuffle_ps( xy, xy, 0xfd);
				z = z*vHi;
			}
			break;
			case 1:
			{
				float4 xy = vertices[0];
				z = _mm_shuffle_ps( xy, xy, 0xaa);
				xy = xy*vLo;
				z = z*vHi;
				x = _mm_shuffle_ps(xy, xy, 0);
				y = _mm_shuffle_ps(xy, xy, 0x55);
			}
			break;
		}
		x = x+y;
		x = x+z;
		stack_array[index] = x;
		min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan
		index++;
	}

	// if we found a new min.
	if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
	{ // we found a new min. Search for it
		// find min across the min vector, place in all elements of min -- big latency hit here
		min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
		min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
		// this where it actually makes a difference is handled in the early out at the top of the function,
		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
		// complexity, and removed it.
		dotmin = min;

		// scan for the first occurrence of min in the array
		size_t test;
		for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
		{}
		minIndex = 4*index + segment + indexTable[test];
	}

	_mm_store_ss( dotResult, dotmin);
	return minIndex;
}


#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h> // for sysctlbyname

static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );

long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;


static inline uint32_t btGetCpuCapabilities( void )
{
	static uint32_t capabilities = 0;
	static bool testedCapabilities = false;

	if( 0 == testedCapabilities)
	{
		uint32_t hasFeature = 0;
		size_t featureSize = sizeof( hasFeature );
		int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );

		if( 0 == err && hasFeature)
			capabilities |= 0x2000;

		testedCapabilities = true;
	}

	return capabilities;
}


static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	if( btGetCpuCapabilities() & 0x2000 )
		_maxdot_large = _maxdot_large_v1;
	else
		_maxdot_large = _maxdot_large_v0;

	return _maxdot_large(vv, vec, count, dotResult);
}

static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	if( btGetCpuCapabilities() & 0x2000 )
		_mindot_large = _mindot_large_v1;
	else
		_mindot_large = _mindot_large_v0;

	return _mindot_large(vv, vec, count, dotResult);
}


#if defined __arm__
# define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
#else
// support 64bit arm
# define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
#endif


long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	unsigned long i = 0;
	float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
	float32x2_t vLo = vget_low_f32(vvec);
	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
	float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
	float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
	uint32x2_t indexLo = (uint32x2_t) {0, 1};
	uint32x2_t indexHi = (uint32x2_t) {2, 3};
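	// iLo/iHi hold the winning vertex indices for the even/odd result lanes; they start at -1
	// (no winner yet) and are updated under the same compare masks as dotMaxLo/dotMaxHi.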
	uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	const uint32x2_t four = (uint32x2_t) {4,4};

	for( ; i+8 <= count; i+= 8 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
		float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
		float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
		float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

		float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
		float32x2_t zLo = vmul_f32( z0.val[0], vHi);
		float32x2_t zHi = vmul_f32( z1.val[0], vHi);

		float32x2_t rLo = vpadd_f32( xy0, xy1);
		float32x2_t rHi = vpadd_f32( xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
		uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
		dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
		dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);

		v0 = vld1q_f32_aligned_postincrement( vv );
		v1 = vld1q_f32_aligned_postincrement( vv );
		v2 = vld1q_f32_aligned_postincrement( vv );
		v3 = vld1q_f32_aligned_postincrement( vv );

		xy0 = vmul_f32( vget_low_f32(v0), vLo);
		xy1 = vmul_f32( vget_low_f32(v1), vLo);
		xy2 = vmul_f32( vget_low_f32(v2), vLo);
		xy3 = vmul_f32( vget_low_f32(v3), vLo);

		z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
		z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
		zLo = vmul_f32( z0.val[0], vHi);
		zHi = vmul_f32( z1.val[0], vHi);

		rLo = vpadd_f32( xy0, xy1);
		rHi = vpadd_f32( xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		maskLo = vcgt_f32( rLo, dotMaxLo );
		maskHi = vcgt_f32( rHi, dotMaxHi );
		dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
		dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);
	}

	for( ; i+4 <= count; i+= 4 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
		float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
		float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
		float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

		float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
		float32x2_t zLo = vmul_f32( z0.val[0], vHi);
		float32x2_t zHi = vmul_f32( z1.val[0], vHi);

		float32x2_t rLo = vpadd_f32( xy0, xy1);
		float32x2_t rHi = vpadd_f32( xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
		uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
		dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
		dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);
	}

	switch( count & 3 )
	{
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

			float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
			float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

			float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32( z0.val[0], vHi);
			float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

			float32x2_t rLo = vpadd_f32( xy0, xy1);
			float32x2_t rHi = vpadd_f32( xy2, xy2);
			rLo = vadd_f32(rLo, zLo);
			rHi = vadd_f32(rHi, zHi);

			uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
			uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
			dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
			dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
			iHi = vbsl_u32(maskHi, indexHi, iHi);
		}
		break;
		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

			float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

			float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32( z0.val[0], vHi);

			float32x2_t rLo = vpadd_f32( xy0, xy1);
			rLo = vadd_f32(rLo, zLo);

			uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
			dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;
		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
			float32x2_t zLo = vmul_f32( z0, vHi);
			float32x2_t rLo = vpadd_f32( xy0, xy0);
			rLo = vadd_f32(rLo, zLo);
			uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
			dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;

		default:
		break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	// select best answer between even and odd results
	dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
	iHi = vdup_lane_u32(iLo, 1);
	mask = vcgt_f32( dotMaxHi, dotMaxLo );
	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	*dotResult = vget_lane_f32( dotMaxLo, 0);
	return vget_lane_u32(iLo, 0);
}


long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
	const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
	uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
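	// The _v1 variant keeps four running maxima in one q register and tracks their candidate
	// vertex indices in 'index'; the per-lane winners are reduced to a single result after the loops.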
	uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
	float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };

	unsigned long i = 0;
	for( ; i + 8 <= count; i += 8 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32( z0, z1);
		float32x4_t z = vmulq_f32( zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32( xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcgtq_f32(x, maxDot);
		maxDot = vbslq_f32( mask, x, maxDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);

		v0 = vld1q_f32_aligned_postincrement( vv );
		v1 = vld1q_f32_aligned_postincrement( vv );
		v2 = vld1q_f32_aligned_postincrement( vv );
		v3 = vld1q_f32_aligned_postincrement( vv );

		// the next two lines should resolve to a single vswp d, d
		xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
		xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
		z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		zb = vuzpq_f32( z0, z1);
		z = vmulq_f32( zb.val[0], vHi);
		xy = vuzpq_f32( xy0, xy1);
		x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		mask = vcgtq_f32(x, maxDot);
		maxDot = vbslq_f32( mask, x, maxDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

	for( ; i + 4 <= count; i += 4 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32( z0, z1);
		float32x4_t z = vmulq_f32( zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32( xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcgtq_f32(x, maxDot);
		maxDot = vbslq_f32( mask, x, maxDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

	switch (count & 3) {
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
			float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
			float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

			xy0 = vmulq_f32(xy0, vLo);
			xy1 = vmulq_f32(xy1, vLo);

			float32x4x2_t zb = vuzpq_f32( z0, z1);
			float32x4_t z = vmulq_f32( zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32( xy0, xy1);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcgtq_f32(x, maxDot);
			maxDot = vbslq_f32( mask, x, maxDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

			xy0 = vmulq_f32(xy0, vLo);

			float32x4x2_t zb = vuzpq_f32( z0, z0);
			float32x4_t z = vmulq_f32( zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32( xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcgtq_f32(x, maxDot);
			maxDot = vbslq_f32( mask, x, maxDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

			xy0 = vmulq_f32(xy0, vLo);

			z = vmulq_f32( z, vHi);
			float32x4x2_t xy = vuzpq_f32( xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcgtq_f32(x, maxDot);
			maxDot = vbslq_f32( mask, x, maxDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		default:
		break;
	}


	// select best answer between hi and lo results
	uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
	float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

	// select best answer between even and odd results
	float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
	mask = vcgt_f32( maxDotO, maxDot2 );
	maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
	index2 = vbsl_u32(mask, indexHi, index2);

	*dotResult = vget_lane_f32( maxDot2, 0);
	return vget_lane_u32(index2, 0);
}

long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	unsigned long i = 0;
	float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
	float32x2_t vLo = vget_low_f32(vvec);
	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
	float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
	float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
	uint32x2_t indexLo = (uint32x2_t) {0, 1};
	uint32x2_t indexHi = (uint32x2_t) {2, 3};
	uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	const uint32x2_t four = (uint32x2_t) {4,4};

	for( ; i+8 <= count; i+= 8 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
		float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
		float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
		float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

		float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
		float32x2_t zLo = vmul_f32( z0.val[0], vHi);
		float32x2_t zHi = vmul_f32( z1.val[0], vHi);

		float32x2_t rLo = vpadd_f32( xy0, xy1);
		float32x2_t rHi = vpadd_f32( xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
		uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
		dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
		dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);

		v0 = vld1q_f32_aligned_postincrement( vv );
		v1 = vld1q_f32_aligned_postincrement( vv );
		v2 = vld1q_f32_aligned_postincrement( vv );
		v3 = vld1q_f32_aligned_postincrement( vv );

		xy0 = vmul_f32( vget_low_f32(v0), vLo);
		xy1 = vmul_f32( vget_low_f32(v1), vLo);
		xy2 = vmul_f32( vget_low_f32(v2), vLo);
		xy3 = vmul_f32( vget_low_f32(v3), vLo);

		z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
		z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
		zLo = vmul_f32( z0.val[0], vHi);
		zHi = vmul_f32( z1.val[0], vHi);

		rLo = vpadd_f32( xy0, xy1);
		rHi = vpadd_f32( xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		maskLo = vclt_f32( rLo, dotMinLo );
		maskHi = vclt_f32( rHi, dotMinHi );
		dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
		dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);
	}

	for( ; i+4 <= count; i+= 4 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
		float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
		float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
		float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

		float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
		float32x2_t zLo = vmul_f32( z0.val[0], vHi);
		float32x2_t zHi = vmul_f32( z1.val[0], vHi);

		float32x2_t rLo = vpadd_f32( xy0, xy1);
		float32x2_t rHi = vpadd_f32( xy2, xy3);
		rLo = vadd_f32(rLo, zLo);
		rHi = vadd_f32(rHi, zHi);

		uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
		uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
		dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
		dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
		iLo = vbsl_u32(maskLo, indexLo, iLo);
		iHi = vbsl_u32(maskHi, indexHi, iHi);
		indexLo = vadd_u32(indexLo, four);
		indexHi = vadd_u32(indexHi, four);
	}

	switch( count & 3 )
	{
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

			float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
			float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

			float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32( z0.val[0], vHi);
			float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

			float32x2_t rLo = vpadd_f32( xy0, xy1);
			float32x2_t rHi = vpadd_f32( xy2, xy2);
			rLo = vadd_f32(rLo, zLo);
			rHi = vadd_f32(rHi, zHi);

			uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
			uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
			dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
			dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
			iHi = vbsl_u32(maskHi, indexHi, iHi);
		}
		break;
		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

			float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

			float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32( z0.val[0], vHi);

			float32x2_t rLo = vpadd_f32( xy0, xy1);
			rLo = vadd_f32(rLo, zLo);

			uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
			dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;
		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
			float32x2_t zLo = vmul_f32( z0, vHi);
			float32x2_t rLo = vpadd_f32( xy0, xy0);
			rLo = vadd_f32(rLo, zLo);
			uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
			dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;

		default:
		break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	// select best answer between even and odd results
	dotMinHi = vdup_lane_f32(dotMinLo, 1);
	iHi = vdup_lane_u32(iLo, 1);
	mask = vclt_f32( dotMinHi, dotMinLo );
	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	*dotResult = vget_lane_f32( dotMinLo, 0);
	return vget_lane_u32(iLo, 0);
}

long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
	float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
	const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
	uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
	uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
	float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };

	unsigned long i = 0;
	for( ; i + 8 <= count; i += 8 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32( z0, z1);
		float32x4_t z = vmulq_f32( zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32( xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32( mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);

		v0 = vld1q_f32_aligned_postincrement( vv );
		v1 = vld1q_f32_aligned_postincrement( vv );
		v2 = vld1q_f32_aligned_postincrement( vv );
		v3 = vld1q_f32_aligned_postincrement( vv );

		// the next two lines should resolve to a single vswp d, d
		xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
		xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
		z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		zb = vuzpq_f32( z0, z1);
		z = vmulq_f32( zb.val[0], vHi);
		xy = vuzpq_f32( xy0, xy1);
		x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32( mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

	for( ; i + 4 <= count; i += 4 )
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
		float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32( z0, z1);
		float32x4_t z = vmulq_f32( zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32( xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32( mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

	switch (count & 3) {
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
			float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
			float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

			xy0 = vmulq_f32(xy0, vLo);
			xy1 = vmulq_f32(xy1, vLo);

			float32x4x2_t zb = vuzpq_f32( z0, z1);
			float32x4_t z = vmulq_f32( zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32( xy0, xy1);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32( mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
			float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

			xy0 = vmulq_f32(xy0, vLo);

			float32x4x2_t zb = vuzpq_f32( z0, z0);
			float32x4_t z = vmulq_f32( zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32( xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32( mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

			xy0 = vmulq_f32(xy0, vLo);

			z = vmulq_f32( z, vHi);
			float32x4x2_t xy = vuzpq_f32( xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32( mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		default:
		break;
	}


	// select best answer between hi and lo results
	uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
	float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

	// select best answer between even and odd results
	float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
	mask = vclt_f32( minDotO, minDot2 );
	minDot2 = vbsl_f32(mask, minDotO, minDot2);
	index2 = vbsl_u32(mask, indexHi, index2);

	*dotResult = vget_lane_f32( minDot2, 0);
	return vget_lane_u32(index2, 0);
}

#else
#error Unhandled __APPLE__ arch
#endif

#endif /* defined BT_USE_SIMD_VECTOR3 */
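/*
   Usage sketch (illustrative only, not part of the library): assuming the SSE branch above is the
   one compiled, the kernels scan an array of 16-byte vertices laid out as x,y,z,pad and return the
   index of the vertex whose dot product with a direction vector is largest (or smallest), writing
   that dot product through dotResult. The vertex array must be 16-byte aligned (it is read with
   aligned 128-bit loads); the direction is loaded unaligned. In Bullet these kernels are normally
   reached through btVector3::maxDot()/minDot(), which also handle the small-count early out
   mentioned in the comments above.

       extern long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );

       alignas(16) float verts[4*4] = {  1,  0,  0, 0,
                                         0,  2,  0, 0,
                                         0,  0,  3, 0,
                                        -1, -1, -1, 0 };
       float dir[4] = { 0, 0, 1, 0 };   // search direction; the w lane is ignored
       float bestDot = 0.0f;
       long bestIndex = _maxdot_large( verts, dir, 4, &bestDot );   // expect bestIndex == 2, bestDot == 3.0f
*/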