1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #include "precomp.hpp" 44 45 #undef HAVE_IPP 46 47 namespace cv { namespace hal { 48 49 ///////////////////////////////////// ATAN2 //////////////////////////////////// 50 static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI); 51 static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI); 52 static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI); 53 static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); 54 55 #if CV_NEON 56 static inline float32x4_t cv_vrecpq_f32(float32x4_t val) 57 { 58 float32x4_t reciprocal = vrecpeq_f32(val); 59 reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); 60 reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); 61 return reciprocal; 62 } 63 #endif 64 65 void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) 66 { 67 int i = 0; 68 float scale = angleInDegrees ? 1 : (float)(CV_PI/180); 69 70 #ifdef HAVE_TEGRA_OPTIMIZATION 71 if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale)) 72 return; 73 #endif 74 75 #if CV_SSE2 76 Cv32suf iabsmask; iabsmask.i = 0x7fffffff; 77 __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f); 78 __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f); 79 __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale); 80 __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3); 81 __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7); 82 83 for( ; i <= len - 4; i += 4 ) 84 { 85 __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i); 86 __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask); 87 __m128 mask = _mm_cmplt_ps(ax, ay); 88 __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay); 89 __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps)); 90 __m128 c2 = _mm_mul_ps(c, c); 91 __m128 a = _mm_mul_ps(c2, p7); 92 a = _mm_mul_ps(_mm_add_ps(a, p5), c2); 93 a = _mm_mul_ps(_mm_add_ps(a, p3), c2); 94 a = _mm_mul_ps(_mm_add_ps(a, p1), c); 95 96 __m128 b = _mm_sub_ps(_90, a); 97 a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); 98 99 b = _mm_sub_ps(_180, a); 100 mask = _mm_cmplt_ps(x, z); 101 a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); 102 103 b = _mm_sub_ps(_360, a); 104 mask = _mm_cmplt_ps(y, z); 105 a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); 106 107 a = _mm_mul_ps(a, scale4); 108 _mm_storeu_ps(angle + i, a); 109 } 110 #elif CV_NEON 111 float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON); 112 float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f); 113 float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale); 114 float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3); 115 float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7); 116 117 for( ; i <= len - 4; i += 4 ) 118 { 119 float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i); 120 float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y); 121 float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); 122 float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps))); 123 float32x4_t c2 = vmulq_f32(c, c); 124 float32x4_t a = vmulq_f32(c2, p7); 125 a = vmulq_f32(vaddq_f32(a, p5), c2); 126 a = vmulq_f32(vaddq_f32(a, p3), c2); 127 a = vmulq_f32(vaddq_f32(a, p1), c); 128 129 a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); 130 a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a); 131 a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a); 132 133 vst1q_f32(angle + i, vmulq_f32(a, scale4)); 134 } 135 #endif 136 137 for( ; i < len; i++ ) 138 { 139 float x = X[i], y = Y[i]; 140 float ax = std::abs(x), ay = std::abs(y); 141 float a, c, c2; 142 if( ax >= ay ) 143 { 144 c = ay/(ax + (float)DBL_EPSILON); 145 c2 = c*c; 146 a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; 147 } 148 else 149 { 150 c = ax/(ay + (float)DBL_EPSILON); 151 c2 = c*c; 152 a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; 153 } 154 if( x < 0 ) 155 a = 180.f - a; 156 if( y < 0 ) 157 a = 360.f - a; 158 angle[i] = (float)(a*scale); 159 } 160 } 161 162 163 void magnitude(const float* x, const float* y, float* mag, int len) 164 { 165 #if defined HAVE_IPP 166 CV_IPP_CHECK() 167 { 168 IppStatus status = ippsMagnitude_32f(x, y, mag, len); 169 if (status >= 0) 170 { 171 CV_IMPL_ADD(CV_IMPL_IPP); 172 return; 173 } 174 setIppErrorStatus(); 175 } 176 #endif 177 178 int i = 0; 179 180 #if CV_SIMD128 181 for( ; i <= len - 8; i += 8 ) 182 { 183 v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4); 184 v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4); 185 x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); 186 x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); 187 v_store(mag + i, x0); 188 v_store(mag + i + 4, x1); 189 } 190 #endif 191 192 for( ; i < len; i++ ) 193 { 194 float x0 = x[i], y0 = y[i]; 195 mag[i] = std::sqrt(x0*x0 + y0*y0); 196 } 197 } 198 199 void magnitude(const double* x, const double* y, double* mag, int len) 200 { 201 #if defined(HAVE_IPP) 202 CV_IPP_CHECK() 203 { 204 IppStatus status = ippsMagnitude_64f(x, y, mag, len); 205 if (status >= 0) 206 { 207 CV_IMPL_ADD(CV_IMPL_IPP); 208 return; 209 } 210 setIppErrorStatus(); 211 } 212 #endif 213 214 int i = 0; 215 216 #if CV_SIMD128_64F 217 for( ; i <= len - 4; i += 4 ) 218 { 219 v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); 220 v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2); 221 x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); 222 x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); 223 v_store(mag + i, x0); 224 v_store(mag + i + 2, x1); 225 } 226 #endif 227 228 for( ; i < len; i++ ) 229 { 230 double x0 = x[i], y0 = y[i]; 231 mag[i] = std::sqrt(x0*x0 + y0*y0); 232 } 233 } 234 235 236 void invSqrt(const float* src, float* dst, int len) 237 { 238 #if defined(HAVE_IPP) 239 CV_IPP_CHECK() 240 { 241 if (ippsInvSqrt_32f_A21(src, dst, len) >= 0) 242 { 243 CV_IMPL_ADD(CV_IMPL_IPP); 244 return; 245 } 246 setIppErrorStatus(); 247 } 248 #endif 249 250 int i = 0; 251 252 #if CV_SIMD128 253 for( ; i <= len - 8; i += 8 ) 254 { 255 v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); 256 t0 = v_invsqrt(t0); 257 t1 = v_invsqrt(t1); 258 v_store(dst + i, t0); v_store(dst + i + 4, t1); 259 } 260 #endif 261 262 for( ; i < len; i++ ) 263 dst[i] = 1/std::sqrt(src[i]); 264 } 265 266 267 void invSqrt(const double* src, double* dst, int len) 268 { 269 int i = 0; 270 271 #if CV_SSE2 272 __m128d v_1 = _mm_set1_pd(1.0); 273 for ( ; i <= len - 2; i += 2) 274 _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); 275 #endif 276 277 for( ; i < len; i++ ) 278 dst[i] = 1/std::sqrt(src[i]); 279 } 280 281 282 void sqrt(const float* src, float* dst, int len) 283 { 284 #if defined(HAVE_IPP) 285 CV_IPP_CHECK() 286 { 287 if (ippsSqrt_32f_A21(src, dst, len) >= 0) 288 { 289 CV_IMPL_ADD(CV_IMPL_IPP); 290 return; 291 } 292 setIppErrorStatus(); 293 } 294 #endif 295 296 int i = 0; 297 298 #if CV_SIMD128 299 for( ; i <= len - 8; i += 8 ) 300 { 301 v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); 302 t0 = v_sqrt(t0); 303 t1 = v_sqrt(t1); 304 v_store(dst + i, t0); v_store(dst + i + 4, t1); 305 } 306 #endif 307 308 for( ; i < len; i++ ) 309 dst[i] = std::sqrt(src[i]); 310 } 311 312 313 void sqrt(const double* src, double* dst, int len) 314 { 315 #if defined(HAVE_IPP) 316 CV_IPP_CHECK() 317 { 318 if (ippsSqrt_64f_A50(src, dst, len) >= 0) 319 { 320 CV_IMPL_ADD(CV_IMPL_IPP); 321 return; 322 } 323 setIppErrorStatus(); 324 } 325 #endif 326 327 int i = 0; 328 329 #if CV_SIMD128_64F 330 for( ; i <= len - 4; i += 4 ) 331 { 332 v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); 333 t0 = v_sqrt(t0); 334 t1 = v_sqrt(t1); 335 v_store(dst + i, t0); v_store(dst + i + 2, t1); 336 } 337 #endif 338 339 for( ; i < len; i++ ) 340 dst[i] = std::sqrt(src[i]); 341 } 342 343 ////////////////////////////////////// EXP ///////////////////////////////////// 344 345 typedef union 346 { 347 struct { 348 #if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) 349 int hi; 350 int lo; 351 #else 352 int lo; 353 int hi; 354 #endif 355 } i; 356 double d; 357 } 358 DBLINT; 359 360 #define EXPTAB_SCALE 6 361 #define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) 362 363 #define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2 364 365 static const double expTab[] = { 366 1.0 * EXPPOLY_32F_A0, 367 1.0108892860517004600204097905619 * EXPPOLY_32F_A0, 368 1.0218971486541166782344801347833 * EXPPOLY_32F_A0, 369 1.0330248790212284225001082839705 * EXPPOLY_32F_A0, 370 1.0442737824274138403219664787399 * EXPPOLY_32F_A0, 371 1.0556451783605571588083413251529 * EXPPOLY_32F_A0, 372 1.0671404006768236181695211209928 * EXPPOLY_32F_A0, 373 1.0787607977571197937406800374385 * EXPPOLY_32F_A0, 374 1.0905077326652576592070106557607 * EXPPOLY_32F_A0, 375 1.1023825833078409435564142094256 * EXPPOLY_32F_A0, 376 1.1143867425958925363088129569196 * EXPPOLY_32F_A0, 377 1.126521618608241899794798643787 * EXPPOLY_32F_A0, 378 1.1387886347566916537038302838415 * EXPPOLY_32F_A0, 379 1.151189229952982705817759635202 * EXPPOLY_32F_A0, 380 1.1637248587775775138135735990922 * EXPPOLY_32F_A0, 381 1.1763969916502812762846457284838 * EXPPOLY_32F_A0, 382 1.1892071150027210667174999705605 * EXPPOLY_32F_A0, 383 1.2021567314527031420963969574978 * EXPPOLY_32F_A0, 384 1.2152473599804688781165202513388 * EXPPOLY_32F_A0, 385 1.2284805361068700056940089577928 * EXPPOLY_32F_A0, 386 1.2418578120734840485936774687266 * EXPPOLY_32F_A0, 387 1.2553807570246910895793906574423 * EXPPOLY_32F_A0, 388 1.2690509571917332225544190810323 * EXPPOLY_32F_A0, 389 1.2828700160787782807266697810215 * EXPPOLY_32F_A0, 390 1.2968395546510096659337541177925 * EXPPOLY_32F_A0, 391 1.3109612115247643419229917863308 * EXPPOLY_32F_A0, 392 1.3252366431597412946295370954987 * EXPPOLY_32F_A0, 393 1.3396675240533030053600306697244 * EXPPOLY_32F_A0, 394 1.3542555469368927282980147401407 * EXPPOLY_32F_A0, 395 1.3690024229745906119296011329822 * EXPPOLY_32F_A0, 396 1.3839098819638319548726595272652 * EXPPOLY_32F_A0, 397 1.3989796725383111402095281367152 * EXPPOLY_32F_A0, 398 1.4142135623730950488016887242097 * EXPPOLY_32F_A0, 399 1.4296133383919700112350657782751 * EXPPOLY_32F_A0, 400 1.4451808069770466200370062414717 * EXPPOLY_32F_A0, 401 1.4609177941806469886513028903106 * EXPPOLY_32F_A0, 402 1.476826145939499311386907480374 * EXPPOLY_32F_A0, 403 1.4929077282912648492006435314867 * EXPPOLY_32F_A0, 404 1.5091644275934227397660195510332 * EXPPOLY_32F_A0, 405 1.5255981507445383068512536895169 * EXPPOLY_32F_A0, 406 1.5422108254079408236122918620907 * EXPPOLY_32F_A0, 407 1.5590044002378369670337280894749 * EXPPOLY_32F_A0, 408 1.5759808451078864864552701601819 * EXPPOLY_32F_A0, 409 1.5931421513422668979372486431191 * EXPPOLY_32F_A0, 410 1.6104903319492543081795206673574 * EXPPOLY_32F_A0, 411 1.628027421857347766848218522014 * EXPPOLY_32F_A0, 412 1.6457554781539648445187567247258 * EXPPOLY_32F_A0, 413 1.6636765803267364350463364569764 * EXPPOLY_32F_A0, 414 1.6817928305074290860622509524664 * EXPPOLY_32F_A0, 415 1.7001063537185234695013625734975 * EXPPOLY_32F_A0, 416 1.7186192981224779156293443764563 * EXPPOLY_32F_A0, 417 1.7373338352737062489942020818722 * EXPPOLY_32F_A0, 418 1.7562521603732994831121606193753 * EXPPOLY_32F_A0, 419 1.7753764925265212525505592001993 * EXPPOLY_32F_A0, 420 1.7947090750031071864277032421278 * EXPPOLY_32F_A0, 421 1.8142521755003987562498346003623 * EXPPOLY_32F_A0, 422 1.8340080864093424634870831895883 * EXPPOLY_32F_A0, 423 1.8539791250833855683924530703377 * EXPPOLY_32F_A0, 424 1.8741676341102999013299989499544 * EXPPOLY_32F_A0, 425 1.8945759815869656413402186534269 * EXPPOLY_32F_A0, 426 1.9152065613971472938726112702958 * EXPPOLY_32F_A0, 427 1.9360617934922944505980559045667 * EXPPOLY_32F_A0, 428 1.9571441241754002690183222516269 * EXPPOLY_32F_A0, 429 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, 430 }; 431 432 433 // the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 434 #if (defined _MSC_VER && _MSC_VER < 1500) || \ 435 (!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402) 436 #undef CV_SSE2 437 #define CV_SSE2 0 438 #endif 439 440 static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE); 441 static const double exp_postscale = 1./(1 << EXPTAB_SCALE); 442 static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000 443 444 void exp( const float *_x, float *y, int n ) 445 { 446 static const float 447 A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), 448 A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), 449 A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), 450 A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); 451 452 #undef EXPPOLY 453 #define EXPPOLY(x) \ 454 (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) 455 456 int i = 0; 457 const Cv32suf* x = (const Cv32suf*)_x; 458 Cv32suf buf[4]; 459 460 #if CV_SSE2 461 if( n >= 8 ) 462 { 463 static const __m128d prescale2 = _mm_set1_pd(exp_prescale); 464 static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); 465 static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); 466 static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); 467 468 static const __m128 mA1 = _mm_set1_ps(A1); 469 static const __m128 mA2 = _mm_set1_ps(A2); 470 static const __m128 mA3 = _mm_set1_ps(A3); 471 static const __m128 mA4 = _mm_set1_ps(A4); 472 bool y_aligned = (size_t)(void*)y % 16 == 0; 473 474 ushort CV_DECL_ALIGNED(16) tab_idx[8]; 475 476 for( ; i <= n - 8; i += 8 ) 477 { 478 __m128 xf0, xf1; 479 xf0 = _mm_loadu_ps(&x[i].f); 480 xf1 = _mm_loadu_ps(&x[i+4].f); 481 __m128i xi0, xi1, xi2, xi3; 482 483 xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); 484 xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); 485 486 __m128d xd0 = _mm_cvtps_pd(xf0); 487 __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); 488 __m128d xd1 = _mm_cvtps_pd(xf1); 489 __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); 490 491 xd0 = _mm_mul_pd(xd0, prescale2); 492 xd2 = _mm_mul_pd(xd2, prescale2); 493 xd1 = _mm_mul_pd(xd1, prescale2); 494 xd3 = _mm_mul_pd(xd3, prescale2); 495 496 xi0 = _mm_cvtpd_epi32(xd0); 497 xi2 = _mm_cvtpd_epi32(xd2); 498 499 xi1 = _mm_cvtpd_epi32(xd1); 500 xi3 = _mm_cvtpd_epi32(xd3); 501 502 xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); 503 xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); 504 xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); 505 xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); 506 507 xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); 508 xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); 509 510 xf0 = _mm_mul_ps(xf0, postscale4); 511 xf1 = _mm_mul_ps(xf1, postscale4); 512 513 xi0 = _mm_unpacklo_epi64(xi0, xi2); 514 xi1 = _mm_unpacklo_epi64(xi1, xi3); 515 xi0 = _mm_packs_epi32(xi0, xi1); 516 517 _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); 518 519 xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); 520 xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); 521 xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); 522 xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); 523 xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); 524 525 __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); 526 __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); 527 __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); 528 __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); 529 530 __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); 531 __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); 532 533 yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); 534 yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); 535 536 __m128 zf0 = _mm_add_ps(xf0, mA1); 537 __m128 zf1 = _mm_add_ps(xf1, mA1); 538 539 zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); 540 zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); 541 542 zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); 543 zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); 544 545 zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); 546 zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); 547 548 zf0 = _mm_mul_ps(zf0, yf0); 549 zf1 = _mm_mul_ps(zf1, yf1); 550 551 if( y_aligned ) 552 { 553 _mm_store_ps(y + i, zf0); 554 _mm_store_ps(y + i + 4, zf1); 555 } 556 else 557 { 558 _mm_storeu_ps(y + i, zf0); 559 _mm_storeu_ps(y + i + 4, zf1); 560 } 561 } 562 } 563 else 564 #endif 565 for( ; i <= n - 4; i += 4 ) 566 { 567 double x0 = x[i].f * exp_prescale; 568 double x1 = x[i + 1].f * exp_prescale; 569 double x2 = x[i + 2].f * exp_prescale; 570 double x3 = x[i + 3].f * exp_prescale; 571 int val0, val1, val2, val3, t; 572 573 if( ((x[i].i >> 23) & 255) > 127 + 10 ) 574 x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; 575 576 if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) 577 x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; 578 579 if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) 580 x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; 581 582 if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) 583 x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; 584 585 val0 = cvRound(x0); 586 val1 = cvRound(x1); 587 val2 = cvRound(x2); 588 val3 = cvRound(x3); 589 590 x0 = (x0 - val0)*exp_postscale; 591 x1 = (x1 - val1)*exp_postscale; 592 x2 = (x2 - val2)*exp_postscale; 593 x3 = (x3 - val3)*exp_postscale; 594 595 t = (val0 >> EXPTAB_SCALE) + 127; 596 t = !(t & ~255) ? t : t < 0 ? 0 : 255; 597 buf[0].i = t << 23; 598 599 t = (val1 >> EXPTAB_SCALE) + 127; 600 t = !(t & ~255) ? t : t < 0 ? 0 : 255; 601 buf[1].i = t << 23; 602 603 t = (val2 >> EXPTAB_SCALE) + 127; 604 t = !(t & ~255) ? t : t < 0 ? 0 : 255; 605 buf[2].i = t << 23; 606 607 t = (val3 >> EXPTAB_SCALE) + 127; 608 t = !(t & ~255) ? t : t < 0 ? 0 : 255; 609 buf[3].i = t << 23; 610 611 x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); 612 x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); 613 614 y[i] = (float)x0; 615 y[i + 1] = (float)x1; 616 617 x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); 618 x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); 619 620 y[i + 2] = (float)x2; 621 y[i + 3] = (float)x3; 622 } 623 624 for( ; i < n; i++ ) 625 { 626 double x0 = x[i].f * exp_prescale; 627 int val0, t; 628 629 if( ((x[i].i >> 23) & 255) > 127 + 10 ) 630 x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; 631 632 val0 = cvRound(x0); 633 t = (val0 >> EXPTAB_SCALE) + 127; 634 t = !(t & ~255) ? t : t < 0 ? 0 : 255; 635 636 buf[0].i = t << 23; 637 x0 = (x0 - val0)*exp_postscale; 638 639 y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); 640 } 641 } 642 643 void exp( const double *_x, double *y, int n ) 644 { 645 static const double 646 A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, 647 A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, 648 A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0, 649 A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0, 650 A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, 651 A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; 652 653 #undef EXPPOLY 654 #define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) 655 656 int i = 0; 657 Cv64suf buf[4]; 658 const Cv64suf* x = (const Cv64suf*)_x; 659 660 #if CV_SSE2 661 static const __m128d prescale2 = _mm_set1_pd(exp_prescale); 662 static const __m128d postscale2 = _mm_set1_pd(exp_postscale); 663 static const __m128d maxval2 = _mm_set1_pd(exp_max_val); 664 static const __m128d minval2 = _mm_set1_pd(-exp_max_val); 665 666 static const __m128d mA0 = _mm_set1_pd(A0); 667 static const __m128d mA1 = _mm_set1_pd(A1); 668 static const __m128d mA2 = _mm_set1_pd(A2); 669 static const __m128d mA3 = _mm_set1_pd(A3); 670 static const __m128d mA4 = _mm_set1_pd(A4); 671 static const __m128d mA5 = _mm_set1_pd(A5); 672 673 int CV_DECL_ALIGNED(16) tab_idx[4]; 674 675 for( ; i <= n - 4; i += 4 ) 676 { 677 __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); 678 __m128i xi0, xi1; 679 xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); 680 xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); 681 xf0 = _mm_mul_pd(xf0, prescale2); 682 xf1 = _mm_mul_pd(xf1, prescale2); 683 684 xi0 = _mm_cvtpd_epi32(xf0); 685 xi1 = _mm_cvtpd_epi32(xf1); 686 xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); 687 xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); 688 689 xi0 = _mm_unpacklo_epi64(xi0, xi1); 690 _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); 691 692 xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); 693 xi0 = _mm_packs_epi32(xi0, xi0); 694 xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); 695 xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); 696 xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); 697 xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); 698 xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); 699 700 __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); 701 __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); 702 yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); 703 yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); 704 705 __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); 706 __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); 707 708 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); 709 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); 710 711 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); 712 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); 713 714 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); 715 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); 716 717 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); 718 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); 719 720 zf0 = _mm_mul_pd(zf0, yf0); 721 zf1 = _mm_mul_pd(zf1, yf1); 722 723 _mm_storeu_pd(y + i, zf0); 724 _mm_storeu_pd(y + i + 2, zf1); 725 } 726 #endif 727 for( ; i <= n - 4; i += 4 ) 728 { 729 double x0 = x[i].f * exp_prescale; 730 double x1 = x[i + 1].f * exp_prescale; 731 double x2 = x[i + 2].f * exp_prescale; 732 double x3 = x[i + 3].f * exp_prescale; 733 734 double y0, y1, y2, y3; 735 int val0, val1, val2, val3, t; 736 737 t = (int)(x[i].i >> 52); 738 if( (t & 2047) > 1023 + 10 ) 739 x0 = t < 0 ? -exp_max_val : exp_max_val; 740 741 t = (int)(x[i+1].i >> 52); 742 if( (t & 2047) > 1023 + 10 ) 743 x1 = t < 0 ? -exp_max_val : exp_max_val; 744 745 t = (int)(x[i+2].i >> 52); 746 if( (t & 2047) > 1023 + 10 ) 747 x2 = t < 0 ? -exp_max_val : exp_max_val; 748 749 t = (int)(x[i+3].i >> 52); 750 if( (t & 2047) > 1023 + 10 ) 751 x3 = t < 0 ? -exp_max_val : exp_max_val; 752 753 val0 = cvRound(x0); 754 val1 = cvRound(x1); 755 val2 = cvRound(x2); 756 val3 = cvRound(x3); 757 758 x0 = (x0 - val0)*exp_postscale; 759 x1 = (x1 - val1)*exp_postscale; 760 x2 = (x2 - val2)*exp_postscale; 761 x3 = (x3 - val3)*exp_postscale; 762 763 t = (val0 >> EXPTAB_SCALE) + 1023; 764 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; 765 buf[0].i = (int64)t << 52; 766 767 t = (val1 >> EXPTAB_SCALE) + 1023; 768 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; 769 buf[1].i = (int64)t << 52; 770 771 t = (val2 >> EXPTAB_SCALE) + 1023; 772 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; 773 buf[2].i = (int64)t << 52; 774 775 t = (val3 >> EXPTAB_SCALE) + 1023; 776 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; 777 buf[3].i = (int64)t << 52; 778 779 y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); 780 y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); 781 782 y[i] = y0; 783 y[i + 1] = y1; 784 785 y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); 786 y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); 787 788 y[i + 2] = y2; 789 y[i + 3] = y3; 790 } 791 792 for( ; i < n; i++ ) 793 { 794 double x0 = x[i].f * exp_prescale; 795 int val0, t; 796 797 t = (int)(x[i].i >> 52); 798 if( (t & 2047) > 1023 + 10 ) 799 x0 = t < 0 ? -exp_max_val : exp_max_val; 800 801 val0 = cvRound(x0); 802 t = (val0 >> EXPTAB_SCALE) + 1023; 803 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; 804 805 buf[0].i = (int64)t << 52; 806 x0 = (x0 - val0)*exp_postscale; 807 808 y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); 809 } 810 } 811 812 #undef EXPTAB_SCALE 813 #undef EXPTAB_MASK 814 #undef EXPPOLY_32F_A0 815 816 /////////////////////////////////////////// LOG /////////////////////////////////////// 817 818 #define LOGTAB_SCALE 8 819 #define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) 820 #define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) 821 #define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) 822 823 static const double CV_DECL_ALIGNED(16) icvLogTab[] = { 824 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, 825 .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, 826 .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, 827 .01165061721997527263705585198749759001657, .9884169884169884169884169884169884169884, 828 .01550418653596525274396267235488267033361, .9846153846153846153846153846153846153846, 829 .01934296284313093139406447562578250654042, .9808429118773946360153256704980842911877, 830 .02316705928153437593630670221500622574241, .9770992366412213740458015267175572519084, 831 .02697658769820207233514075539915211265906, .9733840304182509505703422053231939163498, 832 .03077165866675368732785500469617545604706, .9696969696969696969696969696969696969697, 833 .03455238150665972812758397481047722976656, .9660377358490566037735849056603773584906, 834 .03831886430213659461285757856785494368522, .9624060150375939849624060150375939849624, 835 .04207121392068705056921373852674150839447, .9588014981273408239700374531835205992509, 836 .04580953603129420126371940114040626212953, .9552238805970149253731343283582089552239, 837 .04953393512227662748292900118940451648088, .9516728624535315985130111524163568773234, 838 .05324451451881227759255210685296333394944, .9481481481481481481481481481481481481481, 839 .05694137640013842427411105973078520037234, .9446494464944649446494464944649446494465, 840 .06062462181643483993820353816772694699466, .9411764705882352941176470588235294117647, 841 .06429435070539725460836422143984236754475, .9377289377289377289377289377289377289377, 842 .06795066190850773679699159401934593915938, .9343065693430656934306569343065693430657, 843 .07159365318700880442825962290953611955044, .9309090909090909090909090909090909090909, 844 .07522342123758751775142172846244648098944, .9275362318840579710144927536231884057971, 845 .07884006170777602129362549021607264876369, .9241877256317689530685920577617328519856, 846 .08244366921107458556772229485432035289706, .9208633093525179856115107913669064748201, 847 .08603433734180314373940490213499288074675, .9175627240143369175627240143369175627240, 848 .08961215868968712416897659522874164395031, .9142857142857142857142857142857142857143, 849 .09317722485418328259854092721070628613231, .9110320284697508896797153024911032028470, 850 .09672962645855109897752299730200320482256, .9078014184397163120567375886524822695035, 851 .10026945316367513738597949668474029749630, .9045936395759717314487632508833922261484, 852 .10379679368164355934833764649738441221420, .9014084507042253521126760563380281690141, 853 .10731173578908805021914218968959175981580, .8982456140350877192982456140350877192982, 854 .11081436634029011301105782649756292812530, .8951048951048951048951048951048951048951, 855 .11430477128005862852422325204315711744130, .8919860627177700348432055749128919860627, 856 .11778303565638344185817487641543266363440, .8888888888888888888888888888888888888889, 857 .12124924363286967987640707633545389398930, .8858131487889273356401384083044982698962, 858 .12470347850095722663787967121606925502420, .8827586206896551724137931034482758620690, 859 .12814582269193003360996385708858724683530, .8797250859106529209621993127147766323024, 860 .13157635778871926146571524895989568904040, .8767123287671232876712328767123287671233, 861 .13499516453750481925766280255629681050780, .8737201365187713310580204778156996587031, 862 .13840232285911913123754857224412262439730, .8707482993197278911564625850340136054422, 863 .14179791186025733629172407290752744302150, .8677966101694915254237288135593220338983, 864 .14518200984449788903951628071808954700830, .8648648648648648648648648648648648648649, 865 .14855469432313711530824207329715136438610, .8619528619528619528619528619528619528620, 866 .15191604202584196858794030049466527998450, .8590604026845637583892617449664429530201, 867 .15526612891112392955683674244937719777230, .8561872909698996655518394648829431438127, 868 .15860503017663857283636730244325008243330, .8533333333333333333333333333333333333333, 869 .16193282026931324346641360989451641216880, .8504983388704318936877076411960132890365, 870 .16524957289530714521497145597095368430010, .8476821192052980132450331125827814569536, 871 .16855536102980664403538924034364754334090, .8448844884488448844884488448844884488449, 872 .17185025692665920060697715143760433420540, .8421052631578947368421052631578947368421, 873 .17513433212784912385018287750426679849630, .8393442622950819672131147540983606557377, 874 .17840765747281828179637841458315961062910, .8366013071895424836601307189542483660131, 875 .18167030310763465639212199675966985523700, .8338762214983713355048859934853420195440, 876 .18492233849401198964024217730184318497780, .8311688311688311688311688311688311688312, 877 .18816383241818296356839823602058459073300, .8284789644012944983818770226537216828479, 878 .19139485299962943898322009772527962923050, .8258064516129032258064516129032258064516, 879 .19461546769967164038916962454095482826240, .8231511254019292604501607717041800643087, 880 .19782574332991986754137769821682013571260, .8205128205128205128205128205128205128205, 881 .20102574606059073203390141770796617493040, .8178913738019169329073482428115015974441, 882 .20421554142869088876999228432396193966280, .8152866242038216560509554140127388535032, 883 .20739519434607056602715147164417430758480, .8126984126984126984126984126984126984127, 884 .21056476910734961416338251183333341032260, .8101265822784810126582278481012658227848, 885 .21372432939771812687723695489694364368910, .8075709779179810725552050473186119873817, 886 .21687393830061435506806333251006435602900, .8050314465408805031446540880503144654088, 887 .22001365830528207823135744547471404075630, .8025078369905956112852664576802507836991, 888 .22314355131420973710199007200571941211830, .8000000000000000000000000000000000000000, 889 .22626367865045338145790765338460914790630, .7975077881619937694704049844236760124611, 890 .22937410106484582006380890106811420992010, .7950310559006211180124223602484472049689, 891 .23247487874309405442296849741978803649550, .7925696594427244582043343653250773993808, 892 .23556607131276688371634975283086532726890, .7901234567901234567901234567901234567901, 893 .23864773785017498464178231643018079921600, .7876923076923076923076923076923076923077, 894 .24171993688714515924331749374687206000090, .7852760736196319018404907975460122699387, 895 .24478272641769091566565919038112042471760, .7828746177370030581039755351681957186544, 896 .24783616390458124145723672882013488560910, .7804878048780487804878048780487804878049, 897 .25088030628580937353433455427875742316250, .7781155015197568389057750759878419452888, 898 .25391520998096339667426946107298135757450, .7757575757575757575757575757575757575758, 899 .25694093089750041913887912414793390780680, .7734138972809667673716012084592145015106, 900 .25995752443692604627401010475296061486000, .7710843373493975903614457831325301204819, 901 .26296504550088134477547896494797896593800, .7687687687687687687687687687687687687688, 902 .26596354849713793599974565040611196309330, .7664670658682634730538922155688622754491, 903 .26895308734550393836570947314612567424780, .7641791044776119402985074626865671641791, 904 .27193371548364175804834985683555714786050, .7619047619047619047619047619047619047619, 905 .27490548587279922676529508862586226314300, .7596439169139465875370919881305637982196, 906 .27786845100345625159121709657483734190480, .7573964497041420118343195266272189349112, 907 .28082266290088775395616949026589281857030, .7551622418879056047197640117994100294985, 908 .28376817313064456316240580235898960381750, .7529411764705882352941176470588235294118, 909 .28670503280395426282112225635501090437180, .7507331378299120234604105571847507331378, 910 .28963329258304265634293983566749375313530, .7485380116959064327485380116959064327485, 911 .29255300268637740579436012922087684273730, .7463556851311953352769679300291545189504, 912 .29546421289383584252163927885703742504130, .7441860465116279069767441860465116279070, 913 .29836697255179722709783618483925238251680, .7420289855072463768115942028985507246377, 914 .30126133057816173455023545102449133992200, .7398843930635838150289017341040462427746, 915 .30414733546729666446850615102448500692850, .7377521613832853025936599423631123919308, 916 .30702503529491181888388950937951449304830, .7356321839080459770114942528735632183908, 917 .30989447772286465854207904158101882785550, .7335243553008595988538681948424068767908, 918 .31275571000389684739317885942000430077330, .7314285714285714285714285714285714285714, 919 .31560877898630329552176476681779604405180, .7293447293447293447293447293447293447293, 920 .31845373111853458869546784626436419785030, .7272727272727272727272727272727272727273, 921 .32129061245373424782201254856772720813750, .7252124645892351274787535410764872521246, 922 .32411946865421192853773391107097268104550, .7231638418079096045197740112994350282486, 923 .32694034499585328257253991068864706903700, .7211267605633802816901408450704225352113, 924 .32975328637246797969240219572384376078850, .7191011235955056179775280898876404494382, 925 .33255833730007655635318997155991382896900, .7170868347338935574229691876750700280112, 926 .33535554192113781191153520921943709254280, .7150837988826815642458100558659217877095, 927 .33814494400871636381467055798566434532400, .7130919220055710306406685236768802228412, 928 .34092658697059319283795275623560883104800, .7111111111111111111111111111111111111111, 929 .34370051385331840121395430287520866841080, .7091412742382271468144044321329639889197, 930 .34646676734620857063262633346312213689100, .7071823204419889502762430939226519337017, 931 .34922538978528827602332285096053965389730, .7052341597796143250688705234159779614325, 932 .35197642315717814209818925519357435405250, .7032967032967032967032967032967032967033, 933 .35471990910292899856770532096561510115850, .7013698630136986301369863013698630136986, 934 .35745588892180374385176833129662554711100, .6994535519125683060109289617486338797814, 935 .36018440357500774995358483465679455548530, .6975476839237057220708446866485013623978, 936 .36290549368936841911903457003063522279280, .6956521739130434782608695652173913043478, 937 .36561919956096466943762379742111079394830, .6937669376693766937669376693766937669377, 938 .36832556115870762614150635272380895912650, .6918918918918918918918918918918918918919, 939 .37102461812787262962487488948681857436900, .6900269541778975741239892183288409703504, 940 .37371640979358405898480555151763837784530, .6881720430107526881720430107526881720430, 941 .37640097516425302659470730759494472295050, .6863270777479892761394101876675603217158, 942 .37907835293496944251145919224654790014030, .6844919786096256684491978609625668449198, 943 .38174858149084833769393299007788300514230, .6826666666666666666666666666666666666667, 944 .38441169891033200034513583887019194662580, .6808510638297872340425531914893617021277, 945 .38706774296844825844488013899535872042180, .6790450928381962864721485411140583554377, 946 .38971675114002518602873692543653305619950, .6772486772486772486772486772486772486772, 947 .39235876060286384303665840889152605086580, .6754617414248021108179419525065963060686, 948 .39499380824086893770896722344332374632350, .6736842105263157894736842105263157894737, 949 .39762193064713846624158577469643205404280, .6719160104986876640419947506561679790026, 950 .40024316412701266276741307592601515352730, .6701570680628272251308900523560209424084, 951 .40285754470108348090917615991202183067800, .6684073107049608355091383812010443864230, 952 .40546510810816432934799991016916465014230, .6666666666666666666666666666666666666667, 953 .40806588980822172674223224930756259709600, .6649350649350649350649350649350649350649, 954 .41065992498526837639616360320360399782650, .6632124352331606217616580310880829015544, 955 .41324724855021932601317757871584035456180, .6614987080103359173126614987080103359173, 956 .41582789514371093497757669865677598863850, .6597938144329896907216494845360824742268, 957 .41840189913888381489925905043492093682300, .6580976863753213367609254498714652956298, 958 .42096929464412963239894338585145305842150, .6564102564102564102564102564102564102564, 959 .42353011550580327293502591601281892508280, .6547314578005115089514066496163682864450, 960 .42608439531090003260516141381231136620050, .6530612244897959183673469387755102040816, 961 .42863216738969872610098832410585600882780, .6513994910941475826972010178117048346056, 962 .43117346481837132143866142541810404509300, .6497461928934010152284263959390862944162, 963 .43370832042155937902094819946796633303180, .6481012658227848101265822784810126582278, 964 .43623676677491801667585491486534010618930, .6464646464646464646464646464646464646465, 965 .43875883620762790027214350629947148263450, .6448362720403022670025188916876574307305, 966 .44127456080487520440058801796112675219780, .6432160804020100502512562814070351758794, 967 .44378397241030093089975139264424797147500, .6416040100250626566416040100250626566416, 968 .44628710262841947420398014401143882423650, .6400000000000000000000000000000000000000, 969 .44878398282700665555822183705458883196130, .6384039900249376558603491271820448877805, 970 .45127464413945855836729492693848442286250, .6368159203980099502487562189054726368159, 971 .45375911746712049854579618113348260521900, .6352357320099255583126550868486352357320, 972 .45623743348158757315857769754074979573500, .6336633663366336633663366336633663366337, 973 .45870962262697662081833982483658473938700, .6320987654320987654320987654320987654321, 974 .46117571512217014895185229761409573256980, .6305418719211822660098522167487684729064, 975 .46363574096303250549055974261136725544930, .6289926289926289926289926289926289926290, 976 .46608972992459918316399125615134835243230, .6274509803921568627450980392156862745098, 977 .46853771156323925639597405279346276074650, .6259168704156479217603911980440097799511, 978 .47097971521879100631480241645476780831830, .6243902439024390243902439024390243902439, 979 .47341577001667212165614273544633761048330, .6228710462287104622871046228710462287105, 980 .47584590486996386493601107758877333253630, .6213592233009708737864077669902912621359, 981 .47827014848147025860569669930555392056700, .6198547215496368038740920096852300242131, 982 .48068852934575190261057286988943815231330, .6183574879227053140096618357487922705314, 983 .48310107575113581113157579238759353756900, .6168674698795180722891566265060240963855, 984 .48550781578170076890899053978500887751580, .6153846153846153846153846153846153846154, 985 .48790877731923892879351001283794175833480, .6139088729016786570743405275779376498801, 986 .49030398804519381705802061333088204264650, .6124401913875598086124401913875598086124, 987 .49269347544257524607047571407747454941280, .6109785202863961813842482100238663484487, 988 .49507726679785146739476431321236304938800, .6095238095238095238095238095238095238095, 989 .49745538920281889838648226032091770321130, .6080760095011876484560570071258907363420, 990 .49982786955644931126130359189119189977650, .6066350710900473933649289099526066350711, 991 .50219473456671548383667413872899487614650, .6052009456264775413711583924349881796690, 992 .50455601075239520092452494282042607665050, .6037735849056603773584905660377358490566, 993 .50691172444485432801997148999362252652650, .6023529411764705882352941176470588235294, 994 .50926190178980790257412536448100581765150, .6009389671361502347417840375586854460094, 995 .51160656874906207391973111953120678663250, .5995316159250585480093676814988290398126, 996 .51394575110223428282552049495279788970950, .5981308411214953271028037383177570093458, 997 .51627947444845445623684554448118433356300, .5967365967365967365967365967365967365967, 998 .51860776420804555186805373523384332656850, .5953488372093023255813953488372093023256, 999 .52093064562418522900344441950437612831600, .5939675174013921113689095127610208816705, 1000 .52324814376454775732838697877014055848100, .5925925925925925925925925925925925925926, 1001 .52556028352292727401362526507000438869000, .5912240184757505773672055427251732101617, 1002 .52786708962084227803046587723656557500350, .5898617511520737327188940092165898617512, 1003 .53016858660912158374145519701414741575700, .5885057471264367816091954022988505747126, 1004 .53246479886947173376654518506256863474850, .5871559633027522935779816513761467889908, 1005 .53475575061602764748158733709715306758900, .5858123569794050343249427917620137299771, 1006 .53704146589688361856929077475797384977350, .5844748858447488584474885844748858447489, 1007 .53932196859560876944783558428753167390800, .5831435079726651480637813211845102505695, 1008 .54159728243274429804188230264117009937750, .5818181818181818181818181818181818181818, 1009 .54386743096728351609669971367111429572100, .5804988662131519274376417233560090702948, 1010 .54613243759813556721383065450936555862450, .5791855203619909502262443438914027149321, 1011 .54839232556557315767520321969641372561450, .5778781038374717832957110609480812641084, 1012 .55064711795266219063194057525834068655950, .5765765765765765765765765765765765765766, 1013 .55289683768667763352766542084282264113450, .5752808988764044943820224719101123595506, 1014 .55514150754050151093110798683483153581600, .5739910313901345291479820627802690582960, 1015 .55738115013400635344709144192165695130850, .5727069351230425055928411633109619686801, 1016 .55961578793542265941596269840374588966350, .5714285714285714285714285714285714285714, 1017 .56184544326269181269140062795486301183700, .5701559020044543429844097995545657015590, 1018 .56407013828480290218436721261241473257550, .5688888888888888888888888888888888888889, 1019 .56628989502311577464155334382667206227800, .5676274944567627494456762749445676274945, 1020 .56850473535266865532378233183408156037350, .5663716814159292035398230088495575221239, 1021 .57071468100347144680739575051120482385150, .5651214128035320088300220750551876379691, 1022 .57291975356178548306473885531886480748650, .5638766519823788546255506607929515418502, 1023 .57511997447138785144460371157038025558000, .5626373626373626373626373626373626373626, 1024 .57731536503482350219940144597785547375700, .5614035087719298245614035087719298245614, 1025 .57950594641464214795689713355386629700650, .5601750547045951859956236323851203501094, 1026 .58169173963462239562716149521293118596100, .5589519650655021834061135371179039301310, 1027 .58387276558098266665552955601015128195300, .5577342047930283224400871459694989106754, 1028 .58604904500357812846544902640744112432000, .5565217391304347826086956521739130434783, 1029 .58822059851708596855957011939608491957200, .5553145336225596529284164859002169197397, 1030 .59038744660217634674381770309992134571100, .5541125541125541125541125541125541125541, 1031 .59254960960667157898740242671919986605650, .5529157667386609071274298056155507559395, 1032 .59470710774669277576265358220553025603300, .5517241379310344827586206896551724137931, 1033 .59685996110779382384237123915227130055450, .5505376344086021505376344086021505376344, 1034 .59900818964608337768851242799428291618800, .5493562231759656652360515021459227467811, 1035 .60115181318933474940990890900138765573500, .5481798715203426124197002141327623126338, 1036 .60329085143808425240052883964381180703650, .5470085470085470085470085470085470085470, 1037 .60542532396671688843525771517306566238400, .5458422174840085287846481876332622601279, 1038 .60755525022454170969155029524699784815300, .5446808510638297872340425531914893617021, 1039 .60968064953685519036241657886421307921400, .5435244161358811040339702760084925690021, 1040 .61180154110599282990534675263916142284850, .5423728813559322033898305084745762711864, 1041 .61391794401237043121710712512140162289150, .5412262156448202959830866807610993657505, 1042 .61602987721551394351138242200249806046500, .5400843881856540084388185654008438818565, 1043 .61813735955507864705538167982012964785100, .5389473684210526315789473684210526315789, 1044 .62024040975185745772080281312810257077200, .5378151260504201680672268907563025210084, 1045 .62233904640877868441606324267922900617100, .5366876310272536687631027253668763102725, 1046 .62443328801189346144440150965237990021700, .5355648535564853556485355648535564853556, 1047 .62652315293135274476554741340805776417250, .5344467640918580375782881002087682672234, 1048 .62860865942237409420556559780379757285100, .5333333333333333333333333333333333333333, 1049 .63068982562619868570408243613201193511500, .5322245322245322245322245322245322245322, 1050 .63276666957103777644277897707070223987100, .5311203319502074688796680497925311203320, 1051 .63483920917301017716738442686619237065300, .5300207039337474120082815734989648033126, 1052 .63690746223706917739093569252872839570050, .5289256198347107438016528925619834710744, 1053 .63897144645792069983514238629140891134750, .5278350515463917525773195876288659793814, 1054 .64103117942093124081992527862894348800200, .5267489711934156378600823045267489711934, 1055 .64308667860302726193566513757104985415950, .5256673511293634496919917864476386036961, 1056 .64513796137358470073053240412264131009600, .5245901639344262295081967213114754098361, 1057 .64718504499530948859131740391603671014300, .5235173824130879345603271983640081799591, 1058 .64922794662510974195157587018911726772800, .5224489795918367346938775510204081632653, 1059 .65126668331495807251485530287027359008800, .5213849287169042769857433808553971486762, 1060 .65330127201274557080523663898929953575150, .5203252032520325203252032520325203252033, 1061 .65533172956312757406749369692988693714150, .5192697768762677484787018255578093306288, 1062 .65735807270835999727154330685152672231200, .5182186234817813765182186234817813765182, 1063 .65938031808912778153342060249997302889800, .5171717171717171717171717171717171717172, 1064 .66139848224536490484126716182800009846700, .5161290322580645161290322580645161290323, 1065 .66341258161706617713093692145776003599150, .5150905432595573440643863179074446680080, 1066 .66542263254509037562201001492212526500250, .5140562248995983935742971887550200803213, 1067 .66742865127195616370414654738851822912700, .5130260521042084168336673346693386773547, 1068 .66943065394262923906154583164607174694550, .5120000000000000000000000000000000000000, 1069 .67142865660530226534774556057527661323550, .5109780439121756487025948103792415169661, 1070 .67342267521216669923234121597488410770900, .5099601593625498007968127490039840637450, 1071 .67541272562017662384192817626171745359900, .5089463220675944333996023856858846918489, 1072 .67739882359180603188519853574689477682100, .5079365079365079365079365079365079365079, 1073 .67938098479579733801614338517538271844400, .5069306930693069306930693069306930693069, 1074 .68135922480790300781450241629499942064300, .5059288537549407114624505928853754940711, 1075 .68333355911162063645036823800182901322850, .5049309664694280078895463510848126232742, 1076 .68530400309891936760919861626462079584600, .5039370078740157480314960629921259842520, 1077 .68727057207096020619019327568821609020250, .5029469548133595284872298624754420432220, 1078 .68923328123880889251040571252815425395950, .5019607843137254901960784313725490196078, 1079 .69314718055994530941723212145818, 5.0e-01, 1080 }; 1081 1082 1083 1084 #define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) 1085 static const double ln_2 = 0.69314718055994530941723212145818; 1086 1087 void log( const float *_x, float *y, int n ) 1088 { 1089 static const float shift[] = { 0, -1.f/512 }; 1090 static const float 1091 A0 = 0.3333333333333333333333333f, 1092 A1 = -0.5f, 1093 A2 = 1.f; 1094 1095 #undef LOGPOLY 1096 #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) 1097 1098 int i = 0; 1099 Cv32suf buf[4]; 1100 const int* x = (const int*)_x; 1101 1102 #if CV_SSE2 1103 static const __m128d ln2_2 = _mm_set1_pd(ln_2); 1104 static const __m128 _1_4 = _mm_set1_ps(1.f); 1105 static const __m128 shift4 = _mm_set1_ps(-1.f/512); 1106 1107 static const __m128 mA0 = _mm_set1_ps(A0); 1108 static const __m128 mA1 = _mm_set1_ps(A1); 1109 static const __m128 mA2 = _mm_set1_ps(A2); 1110 1111 int CV_DECL_ALIGNED(16) idx[4]; 1112 1113 for( ; i <= n - 4; i += 4 ) 1114 { 1115 __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); 1116 __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127)); 1117 __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); 1118 __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); 1119 1120 __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); 1121 1122 h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); 1123 _mm_store_si128((__m128i*)idx, h0); 1124 h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); 1125 1126 __m128d t0, t1, t2, t3, t4; 1127 t0 = _mm_load_pd(icvLogTab + idx[0]); 1128 t2 = _mm_load_pd(icvLogTab + idx[1]); 1129 t1 = _mm_unpackhi_pd(t0, t2); 1130 t0 = _mm_unpacklo_pd(t0, t2); 1131 t2 = _mm_load_pd(icvLogTab + idx[2]); 1132 t4 = _mm_load_pd(icvLogTab + idx[3]); 1133 t3 = _mm_unpackhi_pd(t2, t4); 1134 t2 = _mm_unpacklo_pd(t2, t4); 1135 1136 yd0 = _mm_add_pd(yd0, t0); 1137 yd1 = _mm_add_pd(yd1, t2); 1138 1139 __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); 1140 1141 __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); 1142 xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); 1143 xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); 1144 1145 __m128 zf0 = _mm_mul_ps(xf0, mA0); 1146 zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); 1147 zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); 1148 yf0 = _mm_add_ps(yf0, zf0); 1149 1150 _mm_storeu_ps(y + i, yf0); 1151 } 1152 #endif 1153 for( ; i <= n - 4; i += 4 ) 1154 { 1155 double x0, x1, x2, x3; 1156 double y0, y1, y2, y3; 1157 int h0, h1, h2, h3; 1158 1159 h0 = x[i]; 1160 h1 = x[i+1]; 1161 buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); 1162 buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); 1163 1164 y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; 1165 y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; 1166 1167 h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1168 h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1169 1170 y0 += icvLogTab[h0]; 1171 y1 += icvLogTab[h1]; 1172 1173 h2 = x[i+2]; 1174 h3 = x[i+3]; 1175 1176 x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); 1177 x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); 1178 1179 buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); 1180 buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); 1181 1182 y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; 1183 y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; 1184 1185 h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1186 h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1187 1188 y2 += icvLogTab[h2]; 1189 y3 += icvLogTab[h3]; 1190 1191 x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); 1192 x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); 1193 1194 x0 += shift[h0 == 510]; 1195 x1 += shift[h1 == 510]; 1196 y0 += LOGPOLY( x0 ); 1197 y1 += LOGPOLY( x1 ); 1198 1199 y[i] = (float) y0; 1200 y[i + 1] = (float) y1; 1201 1202 x2 += shift[h2 == 510]; 1203 x3 += shift[h3 == 510]; 1204 y2 += LOGPOLY( x2 ); 1205 y3 += LOGPOLY( x3 ); 1206 1207 y[i + 2] = (float) y2; 1208 y[i + 3] = (float) y3; 1209 } 1210 1211 for( ; i < n; i++ ) 1212 { 1213 int h0 = x[i]; 1214 double y0; 1215 float x0; 1216 1217 y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; 1218 1219 buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); 1220 h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1221 1222 y0 += icvLogTab[h0]; 1223 x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); 1224 x0 += shift[h0 == 510]; 1225 y0 += LOGPOLY( x0 ); 1226 1227 y[i] = (float)y0; 1228 } 1229 } 1230 1231 void log( const double *x, double *y, int n ) 1232 { 1233 static const double shift[] = { 0, -1./512 }; 1234 static const double 1235 A7 = 1.0, 1236 A6 = -0.5, 1237 A5 = 0.333333333333333314829616256247390992939472198486328125, 1238 A4 = -0.25, 1239 A3 = 0.2, 1240 A2 = -0.1666666666666666574148081281236954964697360992431640625, 1241 A1 = 0.1428571428571428769682682968777953647077083587646484375, 1242 A0 = -0.125; 1243 1244 #undef LOGPOLY 1245 #define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ 1246 (((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ 1247 (((A1*xq + A3)*xq + A5)*xq + A7)*(x)) 1248 1249 int i = 0; 1250 DBLINT buf[4]; 1251 DBLINT *X = (DBLINT *) x; 1252 1253 #if CV_SSE2 1254 static const __m128d ln2_2 = _mm_set1_pd(ln_2); 1255 static const __m128d _1_2 = _mm_set1_pd(1.); 1256 static const __m128d shift2 = _mm_set1_pd(-1./512); 1257 1258 static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); 1259 static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); 1260 1261 static const __m128d mA0 = _mm_set1_pd(A0); 1262 static const __m128d mA1 = _mm_set1_pd(A1); 1263 static const __m128d mA2 = _mm_set1_pd(A2); 1264 static const __m128d mA3 = _mm_set1_pd(A3); 1265 static const __m128d mA4 = _mm_set1_pd(A4); 1266 static const __m128d mA5 = _mm_set1_pd(A5); 1267 static const __m128d mA6 = _mm_set1_pd(A6); 1268 static const __m128d mA7 = _mm_set1_pd(A7); 1269 1270 int CV_DECL_ALIGNED(16) idx[4]; 1271 1272 for( ; i <= n - 4; i += 4 ) 1273 { 1274 __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); 1275 __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); 1276 1277 __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); 1278 __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); 1279 1280 h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); 1281 1282 __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), 1283 _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); 1284 __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); 1285 __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); 1286 1287 h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); 1288 _mm_store_si128((__m128i*)idx, h0); 1289 h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); 1290 1291 __m128d t0, t1, t2, t3, t4; 1292 t0 = _mm_load_pd(icvLogTab + idx[0]); 1293 t2 = _mm_load_pd(icvLogTab + idx[1]); 1294 t1 = _mm_unpackhi_pd(t0, t2); 1295 t0 = _mm_unpacklo_pd(t0, t2); 1296 t2 = _mm_load_pd(icvLogTab + idx[2]); 1297 t4 = _mm_load_pd(icvLogTab + idx[3]); 1298 t3 = _mm_unpackhi_pd(t2, t4); 1299 t2 = _mm_unpacklo_pd(t2, t4); 1300 1301 yd0 = _mm_add_pd(yd0, t0); 1302 yd1 = _mm_add_pd(yd1, t2); 1303 1304 xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); 1305 xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); 1306 1307 xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); 1308 xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); 1309 1310 __m128d zd0 = _mm_mul_pd(xd0, mA0); 1311 __m128d zd1 = _mm_mul_pd(xd1, mA0); 1312 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); 1313 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); 1314 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); 1315 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); 1316 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); 1317 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); 1318 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); 1319 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); 1320 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); 1321 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); 1322 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); 1323 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); 1324 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); 1325 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); 1326 1327 yd0 = _mm_add_pd(yd0, zd0); 1328 yd1 = _mm_add_pd(yd1, zd1); 1329 1330 _mm_storeu_pd(y + i, yd0); 1331 _mm_storeu_pd(y + i + 2, yd1); 1332 } 1333 #endif 1334 for( ; i <= n - 4; i += 4 ) 1335 { 1336 double xq; 1337 double x0, x1, x2, x3; 1338 double y0, y1, y2, y3; 1339 int h0, h1, h2, h3; 1340 1341 h0 = X[i].i.lo; 1342 h1 = X[i + 1].i.lo; 1343 buf[0].i.lo = h0; 1344 buf[1].i.lo = h1; 1345 1346 h0 = X[i].i.hi; 1347 h1 = X[i + 1].i.hi; 1348 buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); 1349 buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); 1350 1351 y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; 1352 y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; 1353 1354 h2 = X[i + 2].i.lo; 1355 h3 = X[i + 3].i.lo; 1356 buf[2].i.lo = h2; 1357 buf[3].i.lo = h3; 1358 1359 h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1360 h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1361 1362 y0 += icvLogTab[h0]; 1363 y1 += icvLogTab[h1]; 1364 1365 h2 = X[i + 2].i.hi; 1366 h3 = X[i + 3].i.hi; 1367 1368 x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); 1369 x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); 1370 1371 buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); 1372 buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); 1373 1374 y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; 1375 y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; 1376 1377 h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1378 h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1379 1380 y2 += icvLogTab[h2]; 1381 y3 += icvLogTab[h3]; 1382 1383 x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); 1384 x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); 1385 1386 y0 += LOGPOLY( x0, h0 == 510 ); 1387 y1 += LOGPOLY( x1, h1 == 510 ); 1388 1389 y[i] = y0; 1390 y[i + 1] = y1; 1391 1392 y2 += LOGPOLY( x2, h2 == 510 ); 1393 y3 += LOGPOLY( x3, h3 == 510 ); 1394 1395 y[i + 2] = y2; 1396 y[i + 3] = y3; 1397 } 1398 1399 for( ; i < n; i++ ) 1400 { 1401 int h0 = X[i].i.hi; 1402 double xq; 1403 double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; 1404 1405 buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); 1406 buf[0].i.lo = X[i].i.lo; 1407 h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; 1408 1409 y0 += icvLogTab[h0]; 1410 x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); 1411 y0 += LOGPOLY( x0, h0 == 510 ); 1412 y[i] = y0; 1413 } 1414 } 1415 1416 }} 1417