1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* ---- includes ----------------------------------------------------------- */ 18 19 #include "b_BasicEm/Math.h" 20 #include "b_BasicEm/Functions.h" 21 22 /* ---- related objects --------------------------------------------------- */ 23 24 /* ---- typedefs ----------------------------------------------------------- */ 25 26 /* ---- constants ---------------------------------------------------------- */ 27 28 /* ------------------------------------------------------------------------- */ 29 30 /* ========================================================================= */ 31 /* */ 32 /* ---- \ghd{ external functions } ----------------------------------------- */ 33 /* */ 34 /* ========================================================================= */ 35 36 #if defined( HW_SSE2 ) 37 extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ); 38 extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ); 39 #endif 40 41 #if defined( HW_FR71 ) 42 int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA ); 43 #endif 44 45 /* ------------------------------------------------------------------------- */ 46 47 uint16 bbs_sqrt32( uint32 valA ) 48 { 49 uint32 rootL = 0; 50 uint32 expL = 0; 51 expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; 52 expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; 53 expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; 54 expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; 55 switch( expL >> 1 ) 56 { 57 case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15; 58 case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14; 59 case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13; 60 case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12; 61 case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11; 62 case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10; 63 case 9: rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9; 64 case 8: rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8; 65 case 7: rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7; 66 case 6: rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6; 67 case 5: rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5; 68 case 4: rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4; 69 case 3: rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3; 70 case 2: rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2; 71 case 1: rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1; 72 case 0: rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0; 73 } 74 75 return ( uint16 )rootL; 76 } 77 78 /* ------------------------------------------------------------------------- */ 79 80 uint8 bbs_sqrt16( uint16 valA ) 81 { 82 uint16 rootL = 0; 83 rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7; 84 rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6; 85 rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5; 86 rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4; 87 rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3; 88 rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2; 89 rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1; 90 rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0; 91 return ( uint8 )rootL; 92 } 93 94 /* ------------------------------------------------------------------------- */ 95 96 /* table of sqrt and slope values */ 97 const uint32 bbs_fastSqrt32_tableG[] = 98 { 99 268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972, 100 284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922, 101 300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878, 102 314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840, 103 328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807, 104 342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777, 105 355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751, 106 367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727, 107 379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705, 108 391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686, 109 402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666, 110 413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650, 111 424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634, 112 434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620, 113 445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605, 114 455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593, 115 464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581, 116 474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569, 117 483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559, 118 493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548, 119 502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539, 120 511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529, 121 519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521, 122 528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514 123 }; 124 125 uint16 bbs_fastSqrt32( uint32 valA ) 126 { 127 uint32 expL = 0; 128 uint32 valL; 129 uint32 offsL; 130 uint32 indexL = 0; 131 132 if( valA == 0 ) return 0; 133 134 /* compute closest even size exponent of valA */ 135 expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; 136 expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; 137 expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; 138 expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; 139 140 valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */ 141 offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13; 142 indexL = ( valL >> 24 ) & 0xFE; 143 144 return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) ); 145 } 146 147 /* ------------------------------------------------------------------------- */ 148 149 /* table of 1/sqrt (1.31) and negative slope (.15) values 150 referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */ 151 const uint32 bbs_invSqrt32_tableG[] = 152 { 153 2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877, 154 2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746, 155 1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646, 156 1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567, 157 1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502, 158 1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448, 159 1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404, 160 1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367, 161 1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334, 162 1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307, 163 1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282, 164 1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262, 165 1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243, 166 1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227, 167 1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211, 168 1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198, 169 1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187, 170 1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176, 171 1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166, 172 1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157, 173 1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149, 174 1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142, 175 1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135, 176 1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128 177 }; 178 179 uint32 bbs_invSqrt32( uint32 valA ) 180 { 181 182 uint32 expL = 0; 183 uint32 valL; 184 uint32 offsL; 185 uint32 indexL = 0; 186 187 if( valA == 0U ) return 0x80000000; 188 189 /* compute closest even size exponent of valA */ 190 expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; 191 expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; 192 expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; 193 expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; 194 195 valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */ 196 offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10; 197 indexL = ( valL >> 24 ) & 0xFE; 198 199 return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 ); 200 } 201 202 /* ------------------------------------------------------------------------- */ 203 204 /* table of 1/( x + 1 ) (2.30) and negative slope (.14) values 205 referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */ 206 const int32 bbs_inv32_tableG[] = 207 { 208 1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664, 209 954433536, 1575, 928628736, 1491, 904200192, 1415, 881016832, 1345, 210 858980352, 1278, 838041600, 1218, 818085888, 1162, 799047680, 1108, 211 780894208, 1059, 763543552, 1013, 746946560, 970, 731054080, 930, 212 715816960, 891, 701218816, 856, 687194112, 823, 673710080, 791, 213 660750336, 761, 648282112, 732, 636289024, 706, 624721920, 681, 214 613564416, 657, 602800128, 635, 592396288, 613, 582352896, 592, 215 572653568, 573, 563265536, 554, 554188800, 537, 545390592, 520, 216 }; 217 218 int32 bbs_inv32( int32 valA ) 219 { 220 221 uint32 expL = 0; 222 int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1; 223 int32 valL = signL * valA; 224 int32 offsL; 225 uint32 indexL = 0; 226 227 if( valL <= ( int32 ) 1 ) return 0x40000000 * signL; 228 229 /* compute size exponent of valL */ 230 expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4; 231 expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3; 232 expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2; 233 expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1; 234 expL += ( ( valL >> ( expL + 0x01 ) ) != 0 ); 235 236 valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/ 237 offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11; 238 indexL = ( valL >> 24 ) & 0xFE; 239 240 return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 ); 241 } 242 243 /* ------------------------------------------------------------------------- */ 244 245 uint32 bbs_intLog2( uint32 valA ) 246 { 247 uint32 expL = 0; 248 expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 ); 249 expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 ); 250 expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 ); 251 expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 ); 252 expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 ); 253 return expL; 254 } 255 256 /* ------------------------------------------------------------------------- */ 257 258 const uint32 bbs_pow2M1_tableG[] = 259 { 260 0, 713, 46769127, 721, 94047537, 729, 141840775, 737, 261 190154447, 745, 238994221, 753, 288365825, 761, 338275050, 769, 262 388727751, 778, 439729846, 786, 491287318, 795, 543406214, 803, 263 596092647, 812, 649352798, 821, 703192913, 830, 757619310, 839, 264 812638371, 848, 868256550, 857, 924480371, 867, 981316430, 876, 265 1038771393, 886, 1096851999, 895, 1155565062, 905, 1214917468, 915, 266 1274916179, 925, 1335568233, 935, 1396880745, 945, 1458860907, 956, 267 1521515988, 966, 1584853338, 976, 1648880387, 987, 1713604645, 998, 268 1779033703, 1009, 1845175238, 1020, 1912037006, 1031, 1979626852, 1042, 269 2047952702, 1053, 2117022572, 1065, 2186844564u, 1077, 2257426868u, 1088, 270 2328777762u, 1100, 2400905617u, 1112, 2473818892u, 1124, 2547526141u, 1136, 271 2622036010u, 1149, 2697357237u, 1161, 2773498659u, 1174, 2850469207u, 1187, 272 2928277909u, 1200, 3006933892u, 1213, 3086446383u, 1226, 3166824708u, 1239, 273 3248078296u, 1253, 3330216677u, 1266, 3413249486u, 1280, 3497186464u, 1294, 274 3582037455u, 1308, 3667812413u, 1323, 3754521399u, 1337, 3842174584u, 1352, 275 3930782250u, 1366, 4020354790u, 1381, 4110902710u, 1396, 4202436633u, 1411 276 }; 277 278 uint32 bbs_pow2M1( uint32 valA ) 279 { 280 uint32 offsL = ( valA & 0x03FFFFFF ) >> 10; 281 uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1; 282 return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ]; 283 } 284 285 /* ------------------------------------------------------------------------- */ 286 287 uint32 bbs_pow2( int32 valA ) 288 { 289 int32 shiftL = 16 - ( valA >> 27 ); 290 uint32 offsL = ( uint32 )( valA << 5 ); 291 if( shiftL == 32 ) return 1; 292 return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL ); 293 } 294 295 /* ------------------------------------------------------------------------- */ 296 297 uint32 bbs_exp( int32 valA ) 298 { 299 int32 adjustedL; 300 int32 shiftL; 301 int32 offsL; 302 303 /* check boundaries to avoid overflow */ 304 if( valA < -1488522236 ) 305 { 306 return 0; 307 } 308 else if( valA > 1488522236 ) 309 { 310 return 0xFFFFFFFF; 311 } 312 313 /* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */ 314 adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 ); 315 316 shiftL = 16 - ( adjustedL >> 27 ); 317 if( shiftL == 32 ) return 1; 318 offsL = ( uint32 )( adjustedL << 5 ); 319 return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL ); 320 } 321 322 /* ------------------------------------------------------------------------- */ 323 324 int16 bbs_satS16( int32 valA ) 325 { 326 if( valA > 32767 ) return 32767; 327 if( valA < -32768 ) return -32768; 328 return valA; 329 } 330 331 /* ------------------------------------------------------------------------- */ 332 333 #if defined( HW_i586 ) || defined( HW_i686 ) 334 335 /* Windows section */ 336 #if defined( WIN32 ) && !defined( WIN64 ) 337 338 /* disable warning "no return value"*/ 339 #pragma warning( disable : 4035 ) 340 341 /** 342 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 343 */ 344 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 345 { 346 __asm 347 { 348 push esi 349 push edi 350 351 mov eax, vec1A 352 mov ebx, vec2A 353 354 mov ecx, sizeA 355 356 pxor mm4, mm4 357 pxor mm6, mm6 358 359 pxor mm7, mm7 360 shr ecx, 4 361 362 inner_loop_start: 363 movq mm0, 0[eax] 364 paddd mm7, mm4 365 366 movq mm1, 0[ebx] 367 paddd mm7, mm6 368 369 movq mm2, 8[eax] 370 pmaddwd mm0, mm1 371 372 movq mm3, 8[ebx] 373 374 movq mm4, 16[eax] 375 pmaddwd mm2, mm3 376 377 movq mm5, 16[ebx] 378 paddd mm7, mm0 379 380 movq mm6, 24[eax] 381 pmaddwd mm4, mm5 382 383 pmaddwd mm6, 24[ebx] 384 paddd mm7, mm2 385 386 add eax, 32 387 add ebx, 32 388 389 dec ecx 390 jnz inner_loop_start 391 392 paddd mm7, mm4 393 394 paddd mm7, mm6 395 396 movq mm0, mm7 397 398 psrlq mm0, 32 399 400 paddd mm7, mm0 401 402 movd eax, mm7 403 404 emms 405 pop edi 406 pop esi 407 } 408 } 409 410 #pragma warning( default : 4035 ) 411 412 /* gcc compiler section */ 413 #elif defined( epl_LINUX ) || defined( CYGWIN ) 414 415 /** 416 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 417 */ 418 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 419 { 420 int32 resultL; 421 422 __asm__ __volatile__( 423 424 "movl %1,%%eax\n\t" 425 "movl %2,%%ebx\n\t" 426 427 "movl %3,%%ecx\n\t" 428 429 "pxor %%mm4,%%mm4\n\t" 430 "pxor %%mm6,%%mm6\n\t" 431 432 "pxor %%mm7, %%mm7\n\t" 433 "shrl $4, %%ecx\n\t" 434 435 "\n1:\t" 436 "movq 0(%%eax),%%mm0\n\t" 437 "paddd %%mm4,%%mm7\n\t" 438 439 "movq 0( %%ebx ),%%mm1\n\t" 440 "paddd %%mm6,%%mm7\n\t" 441 442 "movq 8( %%eax ),%%mm2\n\t" 443 "pmaddwd %%mm1,%%mm0\n\t" 444 445 "movq 8( %%ebx ),%%mm3\n\t" 446 447 "movq 16( %%eax ),%%mm4\n\t" 448 "pmaddwd %%mm3,%%mm2\n\t" 449 450 "movq 16( %%ebx ),%%mm5\n\t" 451 "paddd %%mm0,%%mm7\n\t" 452 453 "movq 24( %%eax ),%%mm6\n\t" 454 "pmaddwd %%mm5,%%mm4\n\t" 455 456 "pmaddwd 24( %%ebx ),%%mm6\n\t" 457 "paddd %%mm2,%%mm7\n\t" 458 459 "addl $32,%%eax\n\t" 460 "addl $32,%%ebx\n\t" 461 462 "decl %%ecx\n\t" 463 "jnz 1b\n\t" 464 465 "paddd %%mm4,%%mm7\n\t" 466 "paddd %%mm6,%%mm7\n\t" 467 468 "movq %%mm7,%%mm0\n\t" 469 470 "psrlq $32,%%mm0\n\t" 471 472 "paddd %%mm0,%%mm7\n\t" 473 474 "movd %%mm7,%0\n\t" 475 476 "emms\n\t" 477 478 : "=&g" ( resultL ) 479 : "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA ) 480 : "si", "di", "ax", "bx", "cx", "st", "memory" ); 481 482 return resultL; 483 } 484 485 #endif /* epl_LINUX, CYGWIN */ 486 487 #endif /* HW_i586 || HW_i686 */ 488 489 /* ------------------------------------------------------------------------- */ 490 491 #ifdef HW_TMS320C6x 492 /** 493 * Calls fast assembler version of dotproduct for DSP. 494 * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors 495 * of even length. 496 */ 497 int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 498 { 499 if( sizeA & 1 ) 500 { 501 int32 resultL; 502 resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 ); 503 return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 ); 504 } 505 else 506 { 507 return dotProduct_C62x( vec1A, vec2A, sizeA ); 508 } 509 } 510 #endif /* HW_TMS320C6x */ 511 512 /* ------------------------------------------------------------------------- */ 513 514 /* 16 dot product for the PS2/EE processor */ 515 /* input vectors MUST be 128 bit aligned ! */ 516 517 #if defined( epl_LINUX ) && defined( HW_EE ) 518 519 int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 520 { 521 int32 resultL = 0, 522 iL = sizeA >> 3, 523 jL = sizeA - ( iL << 3 ); 524 525 if( iL > 0 ) 526 { 527 /* multiply-add elements of input vectors in sets of 8 */ 528 int32 accL[ 4 ], t1L, t2L, t3L; 529 asm volatile ( 530 "pxor %4, %2, %2\n\t" /* reset 8 accumulators (LO and HI register) to 0 */ 531 "pmtlo %4\n\t" 532 "pmthi %4\n\t" 533 534 "\n__begin_loop:\t" 535 536 "lq %2,0(%0)\n\t" /* load 8 pairs of int16 */ 537 "lq %3,0(%1)\n\t" 538 539 "addi %0,%0,16\n\t" /* vec1L += 16 */ 540 "addi %1,%1,16\n\t" /* vec2L += 16 */ 541 "addi %7,%7,-1\n\t" /* iL-- */ 542 543 "pmaddh %4,%2,%3\n\t" /* parallel multiply-add of 8 pairs of int16 */ 544 545 "bgtzl %7,__begin_loop\n\t" /* if iL > 0 goto _begin_loop */ 546 547 "pmflo %2\n\t" /* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */ 548 "pmfhi %3\n\t" 549 "paddw %4,%2,%3\n\t" 550 "sq %4,0(%8)\n\t" 551 : "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L ) 552 : "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL ) 553 : "memory" ); 554 555 /* add 4 parallel accumulators */ 556 resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ]; 557 } 558 559 /* multiply-add remaining elements of input vectors */ 560 for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++; 561 562 return resultL; 563 } 564 565 #endif 566 567 /* ------------------------------------------------------------------------- */ 568 569 #if defined( HW_ARMv5TE ) 570 571 /* fast 16 dot product for ARM9E cores (DSP extensions). 572 * input vectors must be 32 bit aligned 573 */ 574 int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 575 { 576 int32 accuL = 0; 577 578 int32* v1PtrL = ( int32* )vec1A; 579 int32* v2PtrL = ( int32* )vec2A; 580 581 for( ; sizeA >= 4; sizeA -= 4 ) 582 { 583 __asm { 584 smlabb accuL, *v1PtrL, *v2PtrL, accuL; 585 smlatt accuL, *v1PtrL, *v2PtrL, accuL; 586 } 587 v1PtrL++; v2PtrL++; 588 __asm { 589 smlabb accuL, *v1PtrL, *v2PtrL, accuL; 590 smlatt accuL, *v1PtrL, *v2PtrL, accuL; 591 } 592 v1PtrL++; v2PtrL++; 593 } 594 595 vec1A = ( int16* )v1PtrL; 596 vec2A = ( int16* )v2PtrL; 597 598 /* multiply-add remaining elements of input vectors */ 599 for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++; 600 601 return accuL; 602 } 603 604 #endif 605 606 /* ------------------------------------------------------------------------- */ 607 608 /** 609 * Computes a fast dot product using standard C 610 */ 611 int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 612 { 613 int32 accuL = 0; 614 615 for( ; sizeA >= 8; sizeA -= 8 ) 616 { 617 accuL += ( int32 ) *vec1A * *vec2A; 618 accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 ); 619 accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 ); 620 accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 ); 621 622 accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 ); 623 accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 ); 624 accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 ); 625 accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 ); 626 627 vec1A += 8; 628 vec2A += 8; 629 } 630 631 for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++; 632 633 return accuL; 634 } 635 636 /* ------------------------------------------------------------------------- */ 637 638 int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) 639 { 640 /* PC */ 641 #if ( defined( HW_i586 ) || defined( HW_i686 ) ) 642 643 #if defined( HW_SSE2 ) 644 uint32 size16L = sizeA & 0xfffffff0; 645 if( size16L ) 646 { 647 if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) 648 { 649 return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA ); 650 } 651 else 652 { 653 return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA ); 654 } 655 } 656 #elif !defined( WIN64 ) 657 /* MMX version (not supported by 64-bit compiler) */ 658 uint32 size16L = sizeA & 0xfffffff0; 659 if( size16L ) 660 { 661 if( sizeA == size16L ) 662 { 663 return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L ); 664 } 665 return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L ) 666 + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L ); 667 } /* if( size16L ) */ 668 #endif 669 670 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); 671 672 /* Playstation 2 */ 673 #elif defined( HW_EE ) && defined( epl_LINUX ) 674 675 if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) 676 { 677 return bbs_dotProduct_EE( vec1A, vec2A, sizeA ); 678 } 679 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); 680 681 /* ARM9E */ 682 #elif defined( HW_ARMv5TE ) 683 684 return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA ); 685 686 /* TI C6000 */ 687 #elif defined( HW_TMS320C6x ) 688 689 return bbs_dotProduct_dsp( vec1A, vec2A, sizeA ); 690 691 #elif defined( HW_FR71 ) 692 693 uint32 size16L = sizeA & 0xfffffff0; 694 if( size16L ) 695 { 696 if( sizeA == size16L ) 697 { 698 return bbs_dotProduct_fr71( vec1A, vec2A, size16L ); 699 } 700 return bbs_dotProduct_fr71( vec1A, vec2A, size16L ) 701 + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L ); 702 } 703 704 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); 705 706 #endif 707 708 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); 709 } 710 711 /* ------------------------------------------------------------------------- */ 712 713 /* table of fermi and slope values (result: 2.30; offset: .12) 714 referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */ 715 const uint32 bbs_fermi_tableG[] = 716 { 717 45056, 8, 77824, 13, 131072, 21, 217088, 34, 718 356352, 57, 589824, 94, 974848, 155, 1609728, 255, 719 2654208, 418, 4366336, 688, 7184384, 1126, 11796480, 1834, 720 19308544, 2970, 31473664, 4748, 50921472, 7453, 81448960, 11363, 721 127991808, 16573, 195874816, 22680, 288772096, 28469, 405381120, 32102, 722 536870912, 32101, 668356608, 28469, 784965632, 22680, 877862912, 16573, 723 945745920, 11363, 992288768, 7453, 1022816256, 4748, 1042264064, 2970, 724 1054429184, 1834, 1061941248, 1126, 1066553344, 688, 1069371392, 418, 725 1071083520, 255, 1072128000, 155, 1072762880, 94, 1073147904, 57, 726 1073381376, 34, 1073520640, 21, 1073606656, 13, 1073659904, 8, 727 }; 728 729 int32 bbs_fermi( int32 valA ) 730 { 731 uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1; 732 uint32 offsL = ( ( valA & 0x00007FFF ) + 4 ) >> 3; 733 if( valA < -655360 ) return 1; 734 if( valA >= 655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */ 735 return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] ); 736 } 737 738 /* ------------------------------------------------------------------------- */ 739 740 void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA ) 741 { 742 int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1; 743 int32 shiftL = posHighestBitL - nBitsA; 744 if( shiftL > 0 ) 745 { 746 ( *argPtrA ) >>= shiftL; 747 ( *bbpPtrA ) -= shiftL; 748 } 749 } 750 751 /* ------------------------------------------------------------------------- */ 752 753 void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA ) 754 { 755 int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1; 756 int32 shiftL = posHighestBitL - nBitsA; 757 if( shiftL > 0 ) 758 { 759 ( *argPtrA ) >>= shiftL; 760 ( *bbpPtrA ) -= shiftL; 761 } 762 } 763 764 /* ------------------------------------------------------------------------- */ 765 766 uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA ) 767 { 768 if( dstBbpA >= srcBbpA ) 769 { 770 uint32 shiftL = dstBbpA - srcBbpA; 771 if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) ) 772 { 773 /* overflow */ 774 return ( uint32 )0xFFFFFFFF; 775 } 776 else 777 { 778 return srcA << shiftL; 779 } 780 } 781 else 782 { 783 uint32 shiftL = srcBbpA - dstBbpA; 784 uint32 addL = 1L << ( shiftL - 1 ); 785 if( srcA + addL < addL ) 786 { 787 /* rounding would cause overflow */ 788 return srcA >> shiftL; 789 } 790 else 791 { 792 return ( srcA + addL ) >> shiftL; 793 } 794 } 795 } 796 797 /* ------------------------------------------------------------------------- */ 798 799 int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA ) 800 { 801 if( dstBbpA >= srcBbpA ) 802 { 803 uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA ); 804 if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) ) 805 { 806 /* overflow */ 807 return ( uint32 )0x7FFFFFFF; 808 } 809 else if( srcA < ( ( int32 )0x80000000 >> shiftL ) ) 810 { 811 /* underflow */ 812 return ( int32 )0x80000000; 813 } 814 else 815 { 816 return srcA << shiftL; 817 } 818 } 819 else 820 { 821 uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA ); 822 int32 addL = 1L << ( shiftL - 1 ); 823 if( srcA + addL < addL ) 824 { 825 /* rounding would cause overflow */ 826 return srcA >> shiftL; 827 } 828 else 829 { 830 return ( srcA + addL ) >> shiftL; 831 } 832 } 833 } 834 835 /* ------------------------------------------------------------------------- */ 836 837 int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA ) 838 { 839 /* #if defined( HW_TMS320C5x ) 840 uint32 rL; 841 power( ( int16* ) xA, ( int32* ) &rL, nxA ); // does not work properly in DSPLib version 2.20.02 842 return ( rL >> 1 ); 843 #else*/ 844 /* needs to be optimized */ 845 int32 rL = 0; 846 for( ; nxA--; ) 847 { 848 rL += ( int32 ) *xA * *xA; 849 xA++; 850 } 851 return rL; 852 /* #endif */ 853 } 854 855 /* ------------------------------------------------------------------------- */ 856 857 void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA ) 858 { 859 uint32 log1L = bbs_intLog2( v1A ); 860 uint32 log2L = bbs_intLog2( v2A ); 861 862 if( log1L + log2L < 32 ) 863 { 864 *manPtrA = v1A * v2A; 865 *expPtrA = 0; 866 } 867 else 868 { 869 uint32 v1L = v1A; 870 uint32 v2L = v2A; 871 uint32 exp1L = 0; 872 uint32 exp2L = 0; 873 if( log1L > 15 && log2L > 15 ) 874 { 875 exp1L = log1L - 15; 876 exp2L = log2L - 15; 877 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; 878 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; 879 } 880 else if( log1L > 15 ) 881 { 882 exp1L = log1L + log2L - 31; 883 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; 884 } 885 else 886 { 887 exp2L = log1L + log2L - 31; 888 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; 889 } 890 891 *manPtrA = v1L * v2L; 892 *expPtrA = exp1L + exp2L; 893 } 894 } 895 896 /* ------------------------------------------------------------------------- */ 897 898 void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA ) 899 { 900 uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A ); 901 uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A ); 902 903 if( log1L + log2L < 30 ) 904 { 905 *manPtrA = v1A * v2A; 906 *expPtrA = 0; 907 } 908 else 909 { 910 int32 v1L = v1A; 911 int32 v2L = v2A; 912 int32 exp1L = 0; 913 int32 exp2L = 0; 914 if( log1L > 14 && log2L > 14 ) 915 { 916 exp1L = log1L - 14; 917 exp2L = log2L - 14; 918 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; 919 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; 920 } 921 else if( log1L > 14 ) 922 { 923 exp1L = log1L + log2L - 29; 924 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; 925 } 926 else 927 { 928 exp2L = log1L + log2L - 29; 929 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; 930 } 931 932 *manPtrA = v1L * v2L; 933 *expPtrA = exp1L + exp2L; 934 } 935 } 936 937 /* ------------------------------------------------------------------------- */ 938 939 void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA ) 940 { 941 uint32 sumL = 0; 942 int32 sumExpL = 0; 943 944 uint32 iL; 945 for( iL = 0; iL < sizeA; iL++ ) 946 { 947 int32 vL = vecA[ iL ]; 948 int32 logL = bbs_intLog2( vL > 0 ? vL : -vL ); 949 int32 expL = ( logL > 14 ) ? logL - 14 : 0; 950 uint32 prdL; 951 952 if( expL >= 1 ) 953 { 954 vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1; 955 } 956 else 957 { 958 vL = vL >> expL; 959 } 960 961 prdL = vL * vL; 962 expL <<= 1; /* now exponent of product */ 963 964 if( sumExpL > expL ) 965 { 966 uint32 shrL = sumExpL - expL; 967 prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1; 968 } 969 else if( expL > sumExpL ) 970 { 971 uint32 shrL = expL - sumExpL; 972 sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1; 973 sumExpL += shrL; 974 } 975 976 sumL += prdL; 977 978 if( sumL > 0x80000000 ) 979 { 980 sumL = ( sumL + 1 ) >> 1; 981 sumExpL++; 982 } 983 } 984 985 /* make exponent even */ 986 if( ( sumExpL & 1 ) != 0 ) 987 { 988 sumL = ( sumL + 1 ) >> 1; 989 sumExpL++; 990 } 991 992 if( manPtrA != NULL ) *manPtrA = sumL; 993 if( expPtrA != NULL ) *expPtrA = sumExpL; 994 } 995 996 /* ------------------------------------------------------------------------- */ 997 998 void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA ) 999 { 1000 uint32 sumL = 0; 1001 int32 sumExpL = 0; 1002 1003 uint32 iL; 1004 for( iL = 0; iL < sizeA; iL++ ) 1005 { 1006 int32 vL = vecA[ iL ]; 1007 uint32 prdL = vL * vL; 1008 1009 if( sumExpL > 0 ) 1010 { 1011 uint32 shrL = sumExpL; 1012 prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1; 1013 } 1014 1015 sumL += prdL; 1016 1017 if( sumL > 0x80000000 ) 1018 { 1019 sumL = ( sumL + 1 ) >> 1; 1020 sumExpL++; 1021 } 1022 } 1023 1024 /* make exponent even */ 1025 if( ( sumExpL & 1 ) != 0 ) 1026 { 1027 sumL = ( sumL + 1 ) >> 1; 1028 sumExpL++; 1029 } 1030 1031 if( manPtrA != NULL ) *manPtrA = sumL; 1032 if( expPtrA != NULL ) *expPtrA = sumExpL; 1033 } 1034 1035 /* ------------------------------------------------------------------------- */ 1036 1037 uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA ) 1038 { 1039 uint32 manL; 1040 uint32 expL; 1041 bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL ); 1042 manL = bbs_sqrt32( manL ); 1043 return manL << ( expL >> 1 ); 1044 } 1045 1046 /* ------------------------------------------------------------------------- */ 1047 1048 void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA ) 1049 { 1050 #if defined( HW_TMS320C5x ) 1051 /* operands need to be in internal memory for mmul*/ 1052 if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE || 1053 x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ) 1054 { 1055 int16 iL,jL,kL; 1056 int16 *ptr1L, *ptr2L; 1057 int32 sumL; 1058 1059 for( iL = 0; iL < row1A; iL++ ) 1060 { 1061 for( jL = 0; jL < col2A; jL++ ) 1062 { 1063 ptr1L = ( int16* ) x1A + iL * col1A; 1064 ptr2L = ( int16* ) x2A + jL; 1065 sumL = 0; 1066 for( kL = 0; kL < col1A; kL++ ) 1067 { 1068 sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); 1069 ptr2L += col2A; 1070 } 1071 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ 1072 } 1073 } 1074 } 1075 else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA ); 1076 1077 #elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE ) 1078 1079 int32 iL, jL, kL; 1080 int16 *ptr1L, *ptr2L; 1081 int32 sumL; 1082 for( iL = 0; iL < row1A; iL++ ) 1083 { 1084 for( jL = 0; jL < col2A; jL++ ) 1085 { 1086 ptr1L = ( int16* ) x1A + iL * col1A; 1087 ptr2L = ( int16* ) x2A + jL; 1088 sumL = 0; 1089 for( kL = col1A; kL >= 4; kL -= 4 ) 1090 { 1091 sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); 1092 sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); 1093 sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); 1094 sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); 1095 ptr2L += col2A; 1096 } 1097 for( ; kL > 0; kL-- ) 1098 { 1099 sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); 1100 ptr2L += col2A; 1101 } 1102 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ 1103 } 1104 } 1105 #else 1106 /* needs to be optimized */ 1107 int16 iL,jL,kL; 1108 int16 *ptr1L, *ptr2L; 1109 int32 sumL; 1110 1111 for( iL = 0; iL < row1A; iL++ ) 1112 { 1113 for( jL = 0; jL < col2A; jL++ ) 1114 { 1115 ptr1L = ( int16* ) x1A + iL * col1A; 1116 ptr2L = ( int16* ) x2A + jL; 1117 sumL = 0; 1118 for( kL = 0; kL < col1A; kL++ ) 1119 { 1120 sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); 1121 ptr2L += col2A; 1122 } 1123 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ 1124 } 1125 } 1126 #endif 1127 } 1128 1129 /* ------------------------------------------------------------------------- */ 1130 1131 void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A, 1132 const int16 *x2A, int16 col2A, int16 *rA ) 1133 { 1134 const int16* ptr1L = x1A; 1135 1136 int32 iL; 1137 for( iL = row1A; iL > 0 ; iL-- ) 1138 { 1139 int32 jL; 1140 const int16* ptr2L = x2A; 1141 for( jL = col2A; jL > 0 ; jL-- ) 1142 { 1143 int32 kL; 1144 int32 sumL = 0; 1145 for( kL = col1A >> 2; kL > 0; kL-- ) 1146 { 1147 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); 1148 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); 1149 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); 1150 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); 1151 } 1152 for( kL = col1A & 3; kL > 0; kL-- ) 1153 { 1154 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); 1155 } 1156 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ 1157 ptr1L -= col1A; 1158 } 1159 ptr1L += col1A; 1160 } 1161 } 1162 1163 /* ------------------------------------------------------------------------- */ 1164 1165 1166 #ifndef mtrans 1167 uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA ) 1168 { 1169 /* needs to be optimized */ 1170 int16 iL; 1171 for( iL = colA; iL--; ) 1172 { 1173 int16* sL = xA++; 1174 int16 jL; 1175 for( jL = rowA; jL--; ) 1176 { 1177 *rA++ = *sL; 1178 sL += colA; 1179 } 1180 } 1181 return 0; 1182 } 1183 #endif 1184 1185 /* ------------------------------------------------------------------------- */ 1186 #ifndef atan2_16 1187 int16 bbs_atan2( int16 nomA, int16 denomA ) 1188 { 1189 int16 phL, argL; 1190 1191 if( nomA == denomA ) return 8192; 1192 argL = ( ( int32 ) nomA << 15 ) / denomA; 1193 1194 /* 0.318253*2 x 20857 .15 1195 +0.003314*2 x^2 217 .15 1196 -0.130908*2 x^3 -8580 .15 1197 +0.068542*2 x^4 4491 .15 1198 -0.009159*2 x^5 -600 .15 */ 1199 1200 phL = -600; 1201 phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481; 1202 phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580; 1203 phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217; 1204 phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857; 1205 phL = ( ( int32 ) phL * argL ) >> 15; 1206 1207 return phL >> 1; /* /2 */ 1208 } 1209 1210 /* needs to be optimized */ 1211 uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA ) 1212 { 1213 for( ; sizeA--; ) 1214 { 1215 int16 reL = *reA++; 1216 int16 imL = *imA++; 1217 int16 phL = 0; 1218 1219 if( reL < 0 ) 1220 { 1221 reL = -reL; 1222 if( imL < 0 ) 1223 { 1224 imL = -imL; 1225 if( reL > imL ) 1226 { 1227 phL = -32768 + bbs_atan2( imL, reL ); 1228 } 1229 else 1230 { 1231 phL = -16384 - bbs_atan2( reL, imL ); 1232 } 1233 } 1234 else 1235 { 1236 if( reL > imL ) 1237 { 1238 phL = -( -32768 + bbs_atan2( imL, reL ) ); 1239 } 1240 else 1241 { 1242 if( imL == 0 ) phL = 0; 1243 else phL = 16384 + bbs_atan2( reL, imL ); 1244 } 1245 } 1246 } 1247 else 1248 { 1249 if( imL < 0 ) 1250 { 1251 imL = -imL; 1252 if( reL > imL ) 1253 { 1254 phL = -bbs_atan2( imL, reL ); 1255 } 1256 else 1257 { 1258 phL = -16384 + bbs_atan2( reL, imL ); 1259 } 1260 } 1261 else 1262 { 1263 if( reL > imL ) 1264 { 1265 phL = bbs_atan2( imL, reL ); 1266 } 1267 else 1268 { 1269 if( imL == 0 ) phL = 0; 1270 else phL = 16384 - bbs_atan2( reL, imL ); 1271 } 1272 } 1273 } 1274 1275 *phaseA++ = phL; 1276 } 1277 return 0; 1278 } 1279 1280 #endif 1281 1282 /* ------------------------------------------------------------------------- */ 1283