Home | History | Annotate | Download | only in b_BasicEm
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /* ---- includes ----------------------------------------------------------- */
     18 
     19 #include "b_BasicEm/Math.h"
     20 #include "b_BasicEm/Functions.h"
     21 
     22 /* ---- related objects  --------------------------------------------------- */
     23 
     24 /* ---- typedefs ----------------------------------------------------------- */
     25 
     26 /* ---- constants ---------------------------------------------------------- */
     27 
     28 /* ------------------------------------------------------------------------- */
     29 
     30 /* ========================================================================= */
     31 /*                                                                           */
     32 /* ---- \ghd{ external functions } ----------------------------------------- */
     33 /*                                                                           */
     34 /* ========================================================================= */
     35 
     36 #if defined( HW_SSE2 )
     37 	extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
     38 	extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
     39 #endif
     40 
     41 #if defined( HW_FR71 )
     42 	int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA );
     43 #endif
     44 
     45 /* ------------------------------------------------------------------------- */
     46 
     47 uint16 bbs_sqrt32( uint32 valA )
     48 {
     49 	uint32 rootL = 0;
     50 	uint32 expL = 0;
     51 	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
     52 	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
     53 	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
     54 	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
     55 	switch( expL >> 1 )
     56 	{
     57 		case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15;
     58 		case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14;
     59 		case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13;
     60 		case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12;
     61 		case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11;
     62 		case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10;
     63 		case 9:  rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9;
     64 		case 8:  rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8;
     65 		case 7:  rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7;
     66 		case 6:  rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6;
     67 		case 5:  rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5;
     68 		case 4:  rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4;
     69 		case 3:  rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3;
     70 		case 2:  rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2;
     71 		case 1:  rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1;
     72 		case 0:  rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0;
     73 	}
     74 
     75 	return ( uint16 )rootL;
     76 }
     77 
     78 /* ------------------------------------------------------------------------- */
     79 
     80 uint8 bbs_sqrt16( uint16 valA )
     81 {
     82 	uint16 rootL = 0;
     83 	rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7;
     84 	rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6;
     85 	rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5;
     86 	rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4;
     87 	rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3;
     88 	rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2;
     89 	rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1;
     90 	rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0;
     91 	return ( uint8 )rootL;
     92 }
     93 
     94 /* ------------------------------------------------------------------------- */
     95 
     96 /* table of sqrt and slope values */
     97 const uint32 bbs_fastSqrt32_tableG[] =
     98 {
     99 	268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972,
    100 	284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922,
    101 	300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878,
    102 	314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840,
    103 	328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807,
    104 	342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777,
    105 	355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751,
    106 	367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727,
    107 	379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705,
    108 	391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686,
    109 	402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666,
    110 	413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650,
    111 	424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634,
    112 	434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620,
    113 	445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605,
    114 	455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593,
    115 	464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581,
    116 	474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569,
    117 	483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559,
    118 	493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548,
    119 	502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539,
    120 	511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529,
    121 	519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521,
    122 	528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514
    123 };
    124 
    125 uint16 bbs_fastSqrt32( uint32 valA )
    126 {
    127 	uint32 expL = 0;
    128 	uint32 valL;
    129 	uint32 offsL;
    130 	uint32 indexL = 0;
    131 
    132 	if( valA == 0 ) return 0;
    133 
    134 	/* compute closest even size exponent of valA */
    135 	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
    136 	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
    137 	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
    138 	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
    139 
    140 	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
    141 	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13;
    142 	indexL = ( valL >> 24 ) & 0xFE;
    143 
    144 	return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) );
    145 }
    146 
    147 /* ------------------------------------------------------------------------- */
    148 
    149 /* table of 1/sqrt (1.31) and negative slope (.15) values
    150    referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
    151 const uint32 bbs_invSqrt32_tableG[] =
    152 {
    153 	2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877,
    154 	2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746,
    155 	1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646,
    156 	1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567,
    157 	1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502,
    158 	1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448,
    159 	1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404,
    160 	1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367,
    161 	1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334,
    162 	1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307,
    163 	1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282,
    164 	1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262,
    165 	1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243,
    166 	1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227,
    167 	1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211,
    168 	1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198,
    169 	1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187,
    170 	1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176,
    171 	1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166,
    172 	1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157,
    173 	1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149,
    174 	1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142,
    175 	1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135,
    176 	1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128
    177 };
    178 
    179 uint32 bbs_invSqrt32( uint32 valA )
    180 {
    181 
    182 	uint32 expL = 0;
    183 	uint32 valL;
    184 	uint32 offsL;
    185 	uint32 indexL = 0;
    186 
    187 	if( valA == 0U ) return 0x80000000;
    188 
    189 	/* compute closest even size exponent of valA */
    190 	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
    191 	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
    192 	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
    193 	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
    194 
    195 	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
    196 	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10;
    197 	indexL = ( valL >> 24 ) & 0xFE;
    198 
    199 	return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 );
    200 }
    201 
    202 /* ------------------------------------------------------------------------- */
    203 
    204 /* table of 1/( x + 1 ) (2.30) and negative slope (.14) values
    205    referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
    206 const int32 bbs_inv32_tableG[] =
    207 {
    208 	1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664,
    209 	954433536,  1575, 928628736,  1491, 904200192,  1415, 881016832, 1345,
    210 	858980352,  1278, 838041600,  1218, 818085888,  1162, 799047680, 1108,
    211 	780894208,  1059, 763543552,  1013, 746946560,  970,  731054080, 930,
    212 	715816960,  891,  701218816,  856,  687194112,  823,  673710080, 791,
    213 	660750336,  761,  648282112,  732,  636289024,  706,  624721920, 681,
    214 	613564416,  657,  602800128,  635,  592396288,  613,  582352896, 592,
    215 	572653568,  573,  563265536,  554,  554188800,  537,  545390592, 520,
    216 };
    217 
    218 int32 bbs_inv32( int32 valA )
    219 {
    220 
    221 	uint32 expL = 0;
    222 	int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1;
    223 	int32 valL = signL * valA;
    224 	int32 offsL;
    225 	uint32 indexL = 0;
    226 
    227 	if( valL <= ( int32 ) 1 ) return 0x40000000 * signL;
    228 
    229 	/* compute size exponent of valL */
    230 	expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4;
    231 	expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3;
    232 	expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2;
    233 	expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1;
    234 	expL += ( ( valL >> ( expL + 0x01 ) ) != 0 );
    235 
    236 	valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/
    237 	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11;
    238 	indexL = ( valL >> 24 ) & 0xFE;
    239 
    240 	return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 );
    241 }
    242 
    243 /* ------------------------------------------------------------------------- */
    244 
    245 uint32 bbs_intLog2( uint32 valA )
    246 {
    247 	uint32 expL = 0;
    248 	expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 );
    249 	expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 );
    250 	expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 );
    251 	expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 );
    252 	expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 );
    253 	return expL;
    254 }
    255 
    256 /* ------------------------------------------------------------------------- */
    257 
    258 const uint32 bbs_pow2M1_tableG[] =
    259 {
    260 	0,			713,	46769127,	721,	94047537,	729,	141840775,	737,
    261 	190154447,	745,	238994221,	753,	288365825,	761,	338275050,	769,
    262 	388727751,	778,	439729846,	786,	491287318,	795,	543406214,	803,
    263 	596092647,	812,	649352798,	821,	703192913,	830,	757619310,	839,
    264 	812638371,	848,	868256550,	857,	924480371,	867,	981316430,	876,
    265 	1038771393, 886,	1096851999, 895,	1155565062, 905,	1214917468, 915,
    266 	1274916179, 925,	1335568233, 935,	1396880745, 945,	1458860907, 956,
    267 	1521515988, 966,	1584853338, 976,	1648880387, 987,	1713604645, 998,
    268 	1779033703, 1009,	1845175238, 1020,	1912037006, 1031,	1979626852, 1042,
    269 	2047952702, 1053,	2117022572, 1065,	2186844564u, 1077,	2257426868u, 1088,
    270 	2328777762u, 1100,	2400905617u, 1112,	2473818892u, 1124,	2547526141u, 1136,
    271 	2622036010u, 1149,	2697357237u, 1161,	2773498659u, 1174,	2850469207u, 1187,
    272 	2928277909u, 1200,	3006933892u, 1213,	3086446383u, 1226,	3166824708u, 1239,
    273 	3248078296u, 1253,	3330216677u, 1266,	3413249486u, 1280,	3497186464u, 1294,
    274 	3582037455u, 1308,	3667812413u, 1323,	3754521399u, 1337,	3842174584u, 1352,
    275 	3930782250u, 1366,	4020354790u, 1381,	4110902710u, 1396,	4202436633u, 1411
    276 };
    277 
    278 uint32 bbs_pow2M1( uint32 valA )
    279 {
    280 	uint32 offsL = ( valA & 0x03FFFFFF ) >> 10;
    281 	uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1;
    282 	return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ];
    283 }
    284 
    285 /* ------------------------------------------------------------------------- */
    286 
    287 uint32 bbs_pow2( int32 valA )
    288 {
    289 	int32 shiftL = 16 - ( valA >> 27 );
    290 	uint32 offsL  = ( uint32 )( valA << 5 );
    291 	if( shiftL == 32 ) return 1;
    292 	return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
    293 }
    294 
    295 /* ------------------------------------------------------------------------- */
    296 
    297 uint32 bbs_exp( int32 valA )
    298 {
    299 	int32 adjustedL;
    300 	int32 shiftL;
    301 	int32 offsL;
    302 
    303 	/* check boundaries to avoid overflow */
    304 	if( valA < -1488522236 )
    305 	{
    306 		return 0;
    307 	}
    308 	else if( valA > 1488522236 )
    309 	{
    310 		return 0xFFFFFFFF;
    311 	}
    312 
    313 	/* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */
    314 	adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 );
    315 
    316 	shiftL = 16 - ( adjustedL >> 27 );
    317 	if( shiftL == 32 ) return 1;
    318 	offsL  = ( uint32 )( adjustedL << 5 );
    319 	return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
    320 }
    321 
    322 /* ------------------------------------------------------------------------- */
    323 
    324 int16 bbs_satS16( int32 valA )
    325 {
    326 	if( valA > 32767 ) return 32767;
    327 	if( valA < -32768 ) return -32768;
    328 	return valA;
    329 }
    330 
    331 /* ------------------------------------------------------------------------- */
    332 
    333 #if defined( HW_i586 ) || defined( HW_i686 )
    334 
    335 /* Windows section */
    336 #if defined( WIN32 ) && !defined( WIN64 )
    337 
    338 /* disable warning "no return value"*/
    339 #pragma warning( disable : 4035 )
    340 
    341 /**
    342  * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
    343  */
    344 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    345 {
    346 	__asm
    347 	{
    348 			push    esi
    349 			push    edi
    350 
    351 			mov     eax, vec1A
    352 			mov     ebx, vec2A
    353 
    354 			mov     ecx, sizeA
    355 
    356 			pxor    mm4, mm4
    357 			pxor    mm6, mm6
    358 
    359 			pxor    mm7, mm7
    360 			shr		ecx, 4
    361 
    362 		inner_loop_start:
    363 			movq    mm0, 0[eax]
    364 			paddd   mm7, mm4
    365 
    366 			movq    mm1, 0[ebx]
    367 			paddd   mm7, mm6
    368 
    369 			movq    mm2, 8[eax]
    370 			pmaddwd mm0, mm1
    371 
    372 			movq    mm3, 8[ebx]
    373 
    374 			movq    mm4, 16[eax]
    375 			pmaddwd mm2, mm3
    376 
    377 			movq    mm5, 16[ebx]
    378 			paddd   mm7, mm0
    379 
    380 			movq    mm6, 24[eax]
    381 			pmaddwd mm4, mm5
    382 
    383 			pmaddwd mm6, 24[ebx]
    384 			paddd   mm7, mm2
    385 
    386 			add     eax, 32
    387 			add     ebx, 32
    388 
    389 			dec     ecx
    390 			jnz     inner_loop_start
    391 
    392 			paddd   mm7, mm4
    393 
    394 			paddd   mm7, mm6
    395 
    396 			movq    mm0, mm7
    397 
    398 			psrlq   mm0, 32
    399 
    400 			paddd   mm7, mm0
    401 
    402 			movd    eax, mm7
    403 
    404 			emms
    405 			pop     edi
    406 			pop     esi
    407 	}
    408 }
    409 
    410 #pragma warning( default : 4035 )
    411 
    412 /* gcc compiler section */
    413 #elif defined( epl_LINUX ) || defined( CYGWIN )
    414 
    415 /**
    416  * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
    417  */
    418 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    419 {
    420 	int32 resultL;
    421 
    422 	__asm__ __volatile__(
    423 
    424 			"movl %1,%%eax\n\t"
    425 			"movl %2,%%ebx\n\t"
    426 
    427 			"movl %3,%%ecx\n\t"
    428 
    429 			"pxor %%mm4,%%mm4\n\t"
    430 			"pxor %%mm6,%%mm6\n\t"
    431 
    432 			"pxor %%mm7, %%mm7\n\t"
    433 			"shrl $4, %%ecx\n\t"
    434 
    435 			"\n1:\t"
    436 			"movq 0(%%eax),%%mm0\n\t"
    437 			"paddd %%mm4,%%mm7\n\t"
    438 
    439 			"movq 0( %%ebx ),%%mm1\n\t"
    440 			"paddd %%mm6,%%mm7\n\t"
    441 
    442 			"movq 8( %%eax ),%%mm2\n\t"
    443 			"pmaddwd %%mm1,%%mm0\n\t"
    444 
    445 			"movq 8( %%ebx ),%%mm3\n\t"
    446 
    447 			"movq 16( %%eax ),%%mm4\n\t"
    448 			"pmaddwd %%mm3,%%mm2\n\t"
    449 
    450 			"movq 16( %%ebx ),%%mm5\n\t"
    451 			"paddd %%mm0,%%mm7\n\t"
    452 
    453 			"movq 24( %%eax ),%%mm6\n\t"
    454 			"pmaddwd %%mm5,%%mm4\n\t"
    455 
    456 			"pmaddwd 24( %%ebx ),%%mm6\n\t"
    457 			"paddd %%mm2,%%mm7\n\t"
    458 
    459 			"addl $32,%%eax\n\t"
    460 			"addl $32,%%ebx\n\t"
    461 
    462 			"decl %%ecx\n\t"
    463 			"jnz 1b\n\t"
    464 
    465 			"paddd %%mm4,%%mm7\n\t"
    466 			"paddd %%mm6,%%mm7\n\t"
    467 
    468 			"movq  %%mm7,%%mm0\n\t"
    469 
    470 			"psrlq $32,%%mm0\n\t"
    471 
    472 			"paddd %%mm0,%%mm7\n\t"
    473 
    474 			"movd %%mm7,%0\n\t"
    475 
    476 			"emms\n\t"
    477 
    478 		: "=&g" ( resultL )
    479 		: "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA )
    480 		: "si", "di", "ax", "bx", "cx", "st", "memory" );
    481 
    482 	return resultL;
    483 }
    484 
    485 #endif /* epl_LINUX, CYGWIN */
    486 
    487 #endif /* HW_i586 || HW_i686 */
    488 
    489 /* ------------------------------------------------------------------------- */
    490 
    491 #ifdef HW_TMS320C6x
    492 /**
    493  * Calls fast assembler version of dotproduct for DSP.
    494  * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors
    495  * of even length.
    496  */
    497 int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    498 {
    499 	if( sizeA & 1 )
    500 	{
    501 		int32 resultL;
    502 		resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 );
    503 		return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 );
    504 	}
    505 	else
    506 	{
    507 		return dotProduct_C62x( vec1A, vec2A, sizeA );
    508 	}
    509 }
    510 #endif /* HW_TMS320C6x */
    511 
    512 /* ------------------------------------------------------------------------- */
    513 
    514 /* 16 dot product for the PS2/EE processor */
    515 /* input vectors MUST be 128 bit aligned ! */
    516 
    517 #if defined( epl_LINUX ) && defined( HW_EE )
    518 
    519 int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    520 {
    521 	int32 resultL = 0,
    522 	      iL = sizeA >> 3,
    523 	      jL = sizeA - ( iL << 3 );
    524 
    525 	if( iL > 0 )
    526 	{
    527 		/* multiply-add elements of input vectors in sets of 8 */
    528 		int32 accL[ 4 ], t1L, t2L, t3L;
    529 		asm volatile (
    530 			"pxor %4, %2, %2\n\t"			/* reset 8 accumulators (LO and HI register) to 0 */
    531 			"pmtlo %4\n\t"
    532 			"pmthi %4\n\t"
    533 
    534 			"\n__begin_loop:\t"
    535 
    536 			"lq %2,0(%0)\n\t"				/* load 8 pairs of int16 */
    537 			"lq %3,0(%1)\n\t"
    538 
    539 			"addi %0,%0,16\n\t"				/* vec1L += 16 */
    540 			"addi %1,%1,16\n\t"				/* vec2L += 16 */
    541 			"addi %7,%7,-1\n\t"				/* iL-- */
    542 
    543 			"pmaddh %4,%2,%3\n\t"			/* parallel multiply-add of 8 pairs of int16 */
    544 
    545 			"bgtzl %7,__begin_loop\n\t"		/* if iL > 0 goto _begin_loop */
    546 
    547 			"pmflo %2\n\t"					/* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */
    548 			"pmfhi %3\n\t"
    549 			"paddw %4,%2,%3\n\t"
    550 			"sq %4,0(%8)\n\t"
    551 		: "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L )
    552 		: "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL )
    553 		: "memory" );
    554 
    555 		/* add 4 parallel accumulators */
    556 		resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ];
    557 	}
    558 
    559 	/* multiply-add remaining elements of input vectors */
    560 	for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++;
    561 
    562 	return resultL;
    563 }
    564 
    565 #endif
    566 
    567 /* ------------------------------------------------------------------------- */
    568 
    569 #if defined( HW_ARMv5TE )
    570 
    571 /* fast 16 dot product for ARM9E cores (DSP extensions).
    572  * input vectors must be 32 bit aligned
    573  */
    574 int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    575 {
    576 	int32 accuL = 0;
    577 
    578 	int32* v1PtrL = ( int32* )vec1A;
    579 	int32* v2PtrL = ( int32* )vec2A;
    580 
    581 	for( ; sizeA >= 4; sizeA -= 4 )
    582 	{
    583 		__asm {
    584 		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
    585 		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
    586 		}
    587 		v1PtrL++; v2PtrL++;
    588 	    __asm {
    589 		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
    590 		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
    591 		}
    592 		v1PtrL++; v2PtrL++;
    593 	}
    594 
    595 	vec1A = ( int16* )v1PtrL;
    596 	vec2A = ( int16* )v2PtrL;
    597 
    598 	/* multiply-add remaining elements of input vectors */
    599 	for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++;
    600 
    601 	return accuL;
    602 }
    603 
    604 #endif
    605 
    606 /* ------------------------------------------------------------------------- */
    607 
    608 /**
    609  * Computes a fast dot product using standard C
    610  */
    611 int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    612 {
    613 	int32 accuL = 0;
    614 
    615 	for( ; sizeA >= 8; sizeA -= 8 )
    616 	{
    617 		accuL += ( int32 ) *vec1A * *vec2A;
    618 		accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 );
    619 		accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 );
    620 		accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 );
    621 
    622 		accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 );
    623 		accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 );
    624 		accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 );
    625 		accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 );
    626 
    627 		vec1A += 8;
    628 		vec2A += 8;
    629 	}
    630 
    631 	for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++;
    632 
    633 	return accuL;
    634 }
    635 
    636 /* ------------------------------------------------------------------------- */
    637 
    638 int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
    639 {
    640 /* PC */
    641 #if ( defined( HW_i586 ) || defined( HW_i686 ) )
    642 
    643 	#if defined( HW_SSE2 )
    644 		uint32 size16L = sizeA & 0xfffffff0;
    645 		if( size16L )
    646 		{
    647 			if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
    648 			{
    649 				return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
    650 			}
    651 			else
    652 			{
    653 				return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
    654 			}
    655 		}
    656 	#elif !defined( WIN64 )
    657 		/* MMX version (not supported by 64-bit compiler) */
    658 		uint32 size16L = sizeA & 0xfffffff0;
    659 		if( size16L )
    660 		{
    661 			if( sizeA == size16L )
    662 			{
    663 				return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L );
    664 			}
    665 			return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L )
    666 					+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
    667 		} /* if( size16L ) */
    668 	#endif
    669 
    670 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
    671 
    672 /* Playstation 2 */
    673 #elif defined( HW_EE ) && defined( epl_LINUX )
    674 
    675 	if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
    676 	{
    677 		return bbs_dotProduct_EE( vec1A, vec2A, sizeA );
    678 	}
    679 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
    680 
    681 /* ARM9E */
    682 #elif defined( HW_ARMv5TE )
    683 
    684 	return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA );
    685 
    686 /* TI C6000 */
    687 #elif defined( HW_TMS320C6x )
    688 
    689 	return bbs_dotProduct_dsp( vec1A, vec2A, sizeA );
    690 
    691 #elif defined( HW_FR71 )
    692 
    693 	uint32 size16L = sizeA & 0xfffffff0;
    694 	if( size16L )
    695 	{
    696 		if( sizeA == size16L )
    697 		{
    698 			return bbs_dotProduct_fr71( vec1A, vec2A, size16L );
    699 		}
    700 		return bbs_dotProduct_fr71( vec1A, vec2A, size16L )
    701 				+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
    702 	}
    703 
    704 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
    705 
    706 #endif
    707 
    708 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
    709 }
    710 
    711 /* ------------------------------------------------------------------------- */
    712 
    713 /* table of fermi and slope values (result: 2.30; offset: .12)
    714    referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */
    715 const uint32 bbs_fermi_tableG[] =
    716 {
    717 	45056,      8,     77824,      13,    131072,     21,    217088,     34,
    718 	356352,     57,    589824,     94,    974848,     155,   1609728,    255,
    719 	2654208,    418,   4366336,    688,   7184384,    1126,  11796480,   1834,
    720 	19308544,   2970,  31473664,   4748,  50921472,   7453,  81448960,   11363,
    721 	127991808,  16573, 195874816,  22680, 288772096,  28469, 405381120,  32102,
    722 	536870912,  32101, 668356608,  28469, 784965632,  22680, 877862912,  16573,
    723 	945745920,  11363, 992288768,  7453,  1022816256, 4748,  1042264064, 2970,
    724 	1054429184, 1834,  1061941248, 1126,  1066553344, 688,   1069371392, 418,
    725 	1071083520, 255,   1072128000, 155,   1072762880, 94,    1073147904, 57,
    726 	1073381376, 34,    1073520640, 21,    1073606656, 13,    1073659904, 8,
    727 };
    728 
    729 int32 bbs_fermi( int32 valA )
    730 {
    731 	uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1;
    732 	uint32 offsL  = ( ( valA & 0x00007FFF ) + 4 ) >> 3;
    733 	if( valA <  -655360 ) return 1;
    734 	if( valA >=  655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */
    735 	return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] );
    736 }
    737 
    738 /* ------------------------------------------------------------------------- */
    739 
    740 void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
    741 {
    742 	int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1;
    743 	int32 shiftL = posHighestBitL - nBitsA;
    744 	if( shiftL > 0 )
    745 	{
    746 		( *argPtrA ) >>= shiftL;
    747 		( *bbpPtrA ) -= shiftL;
    748 	}
    749 }
    750 
    751 /* ------------------------------------------------------------------------- */
    752 
    753 void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
    754 {
    755 	int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1;
    756 	int32 shiftL = posHighestBitL - nBitsA;
    757 	if( shiftL > 0 )
    758 	{
    759 		( *argPtrA ) >>= shiftL;
    760 		( *bbpPtrA ) -= shiftL;
    761 	}
    762 }
    763 
    764 /* ------------------------------------------------------------------------- */
    765 
    766 uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA )
    767 {
    768 	if( dstBbpA >= srcBbpA )
    769 	{
    770 		uint32 shiftL = dstBbpA - srcBbpA;
    771 		if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) )
    772 		{
    773 			/* overflow */
    774 			return ( uint32 )0xFFFFFFFF;
    775 		}
    776 		else
    777 		{
    778 			return srcA << shiftL;
    779 		}
    780 	}
    781 	else
    782 	{
    783 		uint32 shiftL = srcBbpA - dstBbpA;
    784 		uint32 addL = 1L << ( shiftL - 1 );
    785 		if( srcA + addL < addL )
    786 		{
    787 			/* rounding would cause overflow */
    788 			return srcA >> shiftL;
    789 		}
    790 		else
    791 		{
    792 			return ( srcA + addL ) >> shiftL;
    793 		}
    794 	}
    795 }
    796 
    797 /* ------------------------------------------------------------------------- */
    798 
    799 int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA )
    800 {
    801 	if( dstBbpA >= srcBbpA )
    802 	{
    803 		uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA );
    804 		if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) )
    805 		{
    806 			/* overflow */
    807 			return ( uint32 )0x7FFFFFFF;
    808 		}
    809 		else if( srcA < ( ( int32 )0x80000000 >> shiftL ) )
    810 		{
    811 			/* underflow */
    812 			return ( int32 )0x80000000;
    813 		}
    814 		else
    815 		{
    816 			return srcA << shiftL;
    817 		}
    818 	}
    819 	else
    820 	{
    821 		uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA );
    822 		int32 addL = 1L << ( shiftL - 1 );
    823 		if( srcA + addL < addL )
    824 		{
    825 			/* rounding would cause overflow */
    826 			return srcA >> shiftL;
    827 		}
    828 		else
    829 		{
    830 			return ( srcA + addL ) >> shiftL;
    831 		}
    832 	}
    833 }
    834 
    835 /* ------------------------------------------------------------------------- */
    836 
    837 int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA )
    838 {
    839 /*	#if defined( HW_TMS320C5x )
    840 		uint32 rL;
    841 		power( ( int16* ) xA, ( int32* ) &rL, nxA );  // does not work properly in DSPLib version 2.20.02
    842 		return ( rL >> 1 );
    843 	#else*/
    844 		/* needs to be optimized */
    845 		int32 rL = 0;
    846 		for( ; nxA--; )
    847 		{
    848 			rL += ( int32 ) *xA * *xA;
    849 			xA++;
    850 		}
    851 		return rL;
    852 /*	#endif */
    853 }
    854 
    855 /* ------------------------------------------------------------------------- */
    856 
    857 void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA )
    858 {
    859 	uint32 log1L = bbs_intLog2( v1A );
    860 	uint32 log2L = bbs_intLog2( v2A );
    861 
    862 	if( log1L + log2L < 32 )
    863 	{
    864 		*manPtrA = v1A * v2A;
    865 		*expPtrA = 0;
    866 	}
    867 	else
    868 	{
    869 		uint32 v1L = v1A;
    870 		uint32 v2L = v2A;
    871 		uint32 exp1L = 0;
    872 		uint32 exp2L = 0;
    873 		if( log1L > 15 && log2L > 15 )
    874 		{
    875 			exp1L = log1L - 15;
    876 			exp2L = log2L - 15;
    877 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
    878 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
    879 		}
    880 		else if( log1L > 15 )
    881 		{
    882 			exp1L = log1L + log2L - 31;
    883 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
    884 		}
    885 		else
    886 		{
    887 			exp2L = log1L + log2L - 31;
    888 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
    889 		}
    890 
    891 		*manPtrA = v1L * v2L;
    892 		*expPtrA = exp1L + exp2L;
    893 	}
    894 }
    895 
    896 /* ------------------------------------------------------------------------- */
    897 
    898 void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA )
    899 {
    900 	uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A );
    901 	uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A );
    902 
    903 	if( log1L + log2L < 30 )
    904 	{
    905 		*manPtrA = v1A * v2A;
    906 		*expPtrA = 0;
    907 	}
    908 	else
    909 	{
    910 		int32 v1L = v1A;
    911 		int32 v2L = v2A;
    912 		int32 exp1L = 0;
    913 		int32 exp2L = 0;
    914 		if( log1L > 14 && log2L > 14 )
    915 		{
    916 			exp1L = log1L - 14;
    917 			exp2L = log2L - 14;
    918 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
    919 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
    920 		}
    921 		else if( log1L > 14 )
    922 		{
    923 			exp1L = log1L + log2L - 29;
    924 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
    925 		}
    926 		else
    927 		{
    928 			exp2L = log1L + log2L - 29;
    929 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
    930 		}
    931 
    932 		*manPtrA = v1L * v2L;
    933 		*expPtrA = exp1L + exp2L;
    934 	}
    935 }
    936 
    937 /* ------------------------------------------------------------------------- */
    938 
    939 void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
    940 {
    941 	uint32 sumL = 0;
    942 	int32 sumExpL = 0;
    943 
    944 	uint32 iL;
    945 	for( iL = 0; iL < sizeA; iL++ )
    946 	{
    947 		int32 vL = vecA[ iL ];
    948 		int32 logL = bbs_intLog2( vL > 0 ? vL : -vL );
    949 		int32 expL = ( logL > 14 ) ? logL - 14 : 0;
    950 		uint32 prdL;
    951 
    952 		if( expL >= 1 )
    953 		{
    954 			vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1;
    955 		}
    956 		else
    957 		{
    958 			vL = vL >> expL;
    959 		}
    960 
    961 		prdL = vL * vL;
    962 		expL <<= 1; /* now exponent of product */
    963 
    964 		if( sumExpL > expL )
    965 		{
    966 			uint32 shrL = sumExpL - expL;
    967 			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
    968 		}
    969 		else if( expL > sumExpL )
    970 		{
    971 			uint32 shrL = expL - sumExpL;
    972 			sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1;
    973 			sumExpL += shrL;
    974 		}
    975 
    976 		sumL += prdL;
    977 
    978 		if( sumL > 0x80000000 )
    979 		{
    980 			sumL = ( sumL + 1 ) >> 1;
    981 			sumExpL++;
    982 		}
    983 	}
    984 
    985 	/* make exponent even */
    986 	if( ( sumExpL & 1 ) != 0 )
    987 	{
    988 		sumL = ( sumL + 1 ) >> 1;
    989 		sumExpL++;
    990 	}
    991 
    992 	if( manPtrA != NULL ) *manPtrA = sumL;
    993 	if( expPtrA != NULL ) *expPtrA = sumExpL;
    994 }
    995 
    996 /* ------------------------------------------------------------------------- */
    997 
    998 void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
    999 {
   1000 	uint32 sumL = 0;
   1001 	int32 sumExpL = 0;
   1002 
   1003 	uint32 iL;
   1004 	for( iL = 0; iL < sizeA; iL++ )
   1005 	{
   1006 		int32 vL = vecA[ iL ];
   1007 		uint32 prdL = vL * vL;
   1008 
   1009 		if( sumExpL > 0 )
   1010 		{
   1011 			uint32 shrL = sumExpL;
   1012 			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
   1013 		}
   1014 
   1015 		sumL += prdL;
   1016 
   1017 		if( sumL > 0x80000000 )
   1018 		{
   1019 			sumL = ( sumL + 1 ) >> 1;
   1020 			sumExpL++;
   1021 		}
   1022 	}
   1023 
   1024 	/* make exponent even */
   1025 	if( ( sumExpL & 1 ) != 0 )
   1026 	{
   1027 		sumL = ( sumL + 1 ) >> 1;
   1028 		sumExpL++;
   1029 	}
   1030 
   1031 	if( manPtrA != NULL ) *manPtrA = sumL;
   1032 	if( expPtrA != NULL ) *expPtrA = sumExpL;
   1033 }
   1034 
   1035 /* ------------------------------------------------------------------------- */
   1036 
   1037 uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA )
   1038 {
   1039 	uint32 manL;
   1040 	uint32 expL;
   1041 	bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL );
   1042 	manL = bbs_sqrt32( manL );
   1043 	return manL << ( expL >> 1 );
   1044 }
   1045 
   1046 /* ------------------------------------------------------------------------- */
   1047 
   1048 void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA )
   1049 {
   1050 	#if defined( HW_TMS320C5x )
   1051 		/* operands need to be in internal memory for mmul*/
   1052 		if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ||
   1053 			x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE )
   1054 		{
   1055 			int16 iL,jL,kL;
   1056 			int16 *ptr1L, *ptr2L;
   1057 			int32 sumL;
   1058 
   1059 			for( iL = 0; iL < row1A; iL++ )
   1060 			{
   1061 				for( jL = 0; jL < col2A; jL++ )
   1062 				{
   1063 					ptr1L = ( int16* ) x1A + iL * col1A;
   1064 					ptr2L = ( int16* ) x2A + jL;
   1065 					sumL = 0;
   1066 					for( kL = 0; kL < col1A; kL++ )
   1067 					{
   1068 						sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
   1069 						ptr2L += col2A;
   1070 					}
   1071 					*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
   1072 				}
   1073 			}
   1074 		}
   1075 		else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA );
   1076 
   1077 	#elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE )
   1078 
   1079 		int32 iL, jL, kL;
   1080 		int16 *ptr1L, *ptr2L;
   1081 		int32 sumL;
   1082 		for( iL = 0; iL < row1A; iL++ )
   1083 		{
   1084 			for( jL = 0; jL < col2A; jL++ )
   1085 			{
   1086 				ptr1L = ( int16* ) x1A + iL * col1A;
   1087 				ptr2L = ( int16* ) x2A + jL;
   1088 				sumL = 0;
   1089 				for( kL = col1A; kL >= 4; kL -= 4 )
   1090 				{
   1091 					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
   1092 					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
   1093 					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
   1094 					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
   1095 					ptr2L += col2A;
   1096 				}
   1097 				for( ; kL > 0; kL-- )
   1098 				{
   1099 					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
   1100 					ptr2L += col2A;
   1101 				}
   1102 				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
   1103 			}
   1104 		}
   1105 	#else
   1106 		/* needs to be optimized */
   1107 		int16 iL,jL,kL;
   1108 		int16 *ptr1L, *ptr2L;
   1109 		int32 sumL;
   1110 
   1111 		for( iL = 0; iL < row1A; iL++ )
   1112 		{
   1113 			for( jL = 0; jL < col2A; jL++ )
   1114 			{
   1115 				ptr1L = ( int16* ) x1A + iL * col1A;
   1116 				ptr2L = ( int16* ) x2A + jL;
   1117 				sumL = 0;
   1118 				for( kL = 0; kL < col1A; kL++ )
   1119 				{
   1120 					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
   1121 					ptr2L += col2A;
   1122 				}
   1123 				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
   1124 			}
   1125 		}
   1126 	#endif
   1127 }
   1128 
   1129 /* ------------------------------------------------------------------------- */
   1130 
   1131 void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A,
   1132 								 const int16 *x2A, int16 col2A, int16 *rA )
   1133 {
   1134 	const int16* ptr1L = x1A;
   1135 
   1136 	int32 iL;
   1137 	for( iL = row1A; iL > 0 ; iL-- )
   1138 	{
   1139 		int32 jL;
   1140 		const int16* ptr2L = x2A;
   1141 		for( jL = col2A; jL > 0 ; jL-- )
   1142 		{
   1143 			int32 kL;
   1144 			int32 sumL = 0;
   1145 			for( kL = col1A >> 2; kL > 0; kL-- )
   1146 			{
   1147 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
   1148 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
   1149 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
   1150 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
   1151 			}
   1152 			for( kL = col1A & 3; kL > 0; kL-- )
   1153 			{
   1154 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
   1155 			}
   1156 			*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
   1157 			ptr1L -= col1A;
   1158 		}
   1159 		ptr1L += col1A;
   1160 	}
   1161 }
   1162 
   1163 /* ------------------------------------------------------------------------- */
   1164 
   1165 
   1166 #ifndef mtrans
   1167 uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA )
   1168 {
   1169 	/* needs to be optimized */
   1170 	int16 iL;
   1171 	for( iL = colA; iL--; )
   1172 	{
   1173 		int16* sL = xA++;
   1174 		int16 jL;
   1175 		for( jL = rowA; jL--; )
   1176 		{
   1177 			*rA++ = *sL;
   1178 			sL += colA;
   1179 		}
   1180 	}
   1181 	return 0;
   1182 }
   1183 #endif
   1184 
   1185 /* ------------------------------------------------------------------------- */
   1186 #ifndef atan2_16
   1187 int16 bbs_atan2( int16 nomA, int16 denomA )
   1188 {
   1189 	int16 phL, argL;
   1190 
   1191 	if( nomA == denomA ) return 8192;
   1192 	argL = ( ( int32 ) nomA << 15 ) / denomA;
   1193 
   1194 	/* 0.318253*2 x      20857 .15
   1195 	  +0.003314*2 x^2      217 .15
   1196 	  -0.130908*2 x^3    -8580 .15
   1197 	  +0.068542*2 x^4     4491 .15
   1198 	  -0.009159*2 x^5     -600 .15 */
   1199 
   1200 	phL = -600;
   1201 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481;
   1202 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580;
   1203 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217;
   1204 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857;
   1205 	phL = ( ( int32 ) phL * argL ) >> 15;
   1206 
   1207 	return phL >> 1; /* /2 */
   1208 }
   1209 
   1210 /* needs to be optimized */
   1211 uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA )
   1212 {
   1213 	for( ; sizeA--; )
   1214 	{
   1215 		int16 reL = *reA++;
   1216 		int16 imL = *imA++;
   1217 		int16 phL = 0;
   1218 
   1219 		if( reL < 0 )
   1220 		{
   1221 			reL = -reL;
   1222 			if( imL < 0 )
   1223 			{
   1224 				imL = -imL;
   1225 				if( reL > imL )
   1226 				{
   1227 					phL = -32768 + bbs_atan2( imL, reL );
   1228 				}
   1229 				else
   1230 				{
   1231 					phL = -16384 - bbs_atan2( reL, imL );
   1232 				}
   1233 			}
   1234 			else
   1235 			{
   1236 				if( reL > imL )
   1237 				{
   1238 					phL = -( -32768 + bbs_atan2( imL, reL ) );
   1239 				}
   1240 				else
   1241 				{
   1242 					if( imL == 0 ) phL = 0;
   1243 					else phL = 16384 + bbs_atan2( reL, imL );
   1244 				}
   1245 			}
   1246 		}
   1247 		else
   1248 		{
   1249 			if( imL < 0 )
   1250 			{
   1251 				imL = -imL;
   1252 				if( reL > imL )
   1253 				{
   1254 					phL = -bbs_atan2( imL, reL );
   1255 				}
   1256 				else
   1257 				{
   1258 					phL = -16384 + bbs_atan2( reL, imL );
   1259 				}
   1260 			}
   1261 			else
   1262 			{
   1263 				if( reL > imL )
   1264 				{
   1265 					phL = bbs_atan2( imL, reL );
   1266 				}
   1267 				else
   1268 				{
   1269 					if( imL == 0 ) phL = 0;
   1270 					else phL = 16384 - bbs_atan2( reL, imL );
   1271 				}
   1272 			}
   1273 		}
   1274 
   1275 		*phaseA++ = phL;
   1276 	}
   1277 	return 0;
   1278 }
   1279 
   1280 #endif
   1281 
   1282 /* ------------------------------------------------------------------------- */
   1283