1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. 15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved. 16 // Third party copyrights are property of their respective owners. 17 // 18 // Redistribution and use in source and binary forms, with or without modification, 19 // are permitted provided that the following conditions are met: 20 // 21 // * Redistribution's of source code must retain the above copyright notice, 22 // this list of conditions and the following disclaimer. 23 // 24 // * Redistribution's in binary form must reproduce the above copyright notice, 25 // this list of conditions and the following disclaimer in the documentation 26 // and/or other materials provided with the distribution. 27 // 28 // * The name of the copyright holders may not be used to endorse or promote products 29 // derived from this software without specific prior written permission. 30 // 31 // This software is provided by the copyright holders and contributors "as is" and 32 // any express or implied warranties, including, but not limited to, the implied 33 // warranties of merchantability and fitness for a particular purpose are disclaimed. 34 // In no event shall the Intel Corporation or contributors be liable for any direct, 35 // indirect, incidental, special, exemplary, or consequential damages 36 // (including, but not limited to, procurement of substitute goods or services; 37 // loss of use, data, or profits; or business interruption) however caused 38 // and on any theory of liability, whether in contract, strict liability, 39 // or tort (including negligence or otherwise) arising in any way out of 40 // the use of this software, even if advised of the possibility of such damage. 
41 // 42 //M*/ 43 44 #include "precomp.hpp" 45 #include "opencl_kernels_core.hpp" 46 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" 47 48 namespace cv 49 { 50 51 /****************************************************************************************\ 52 * GEMM * 53 \****************************************************************************************/ 54 55 static void 56 GEMM_CopyBlock( const uchar* src, size_t src_step, 57 uchar* dst, size_t dst_step, 58 Size size, size_t pix_size ) 59 { 60 int j; 61 size.width *= (int)(pix_size / sizeof(int)); 62 63 for( ; size.height--; src += src_step, dst += dst_step ) 64 { 65 j=0; 66 #if CV_ENABLE_UNROLLED 67 for( ; j <= size.width - 4; j += 4 ) 68 { 69 int t0 = ((const int*)src)[j]; 70 int t1 = ((const int*)src)[j+1]; 71 ((int*)dst)[j] = t0; 72 ((int*)dst)[j+1] = t1; 73 t0 = ((const int*)src)[j+2]; 74 t1 = ((const int*)src)[j+3]; 75 ((int*)dst)[j+2] = t0; 76 ((int*)dst)[j+3] = t1; 77 } 78 #endif 79 for( ; j < size.width; j++ ) 80 ((int*)dst)[j] = ((const int*)src)[j]; 81 } 82 } 83 84 85 static void 86 GEMM_TransposeBlock( const uchar* src, size_t src_step, 87 uchar* dst, size_t dst_step, 88 Size size, size_t pix_size ) 89 { 90 int i, j; 91 for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size ) 92 { 93 const uchar* _src = src; 94 switch( pix_size ) 95 { 96 case sizeof(int): 97 for( j = 0; j < size.height; j++, _src += src_step ) 98 ((int*)dst)[j] = ((int*)_src)[0]; 99 break; 100 case sizeof(int)*2: 101 for( j = 0; j < size.height*2; j += 2, _src += src_step ) 102 { 103 int t0 = ((int*)_src)[0]; 104 int t1 = ((int*)_src)[1]; 105 ((int*)dst)[j] = t0; 106 ((int*)dst)[j+1] = t1; 107 } 108 break; 109 case sizeof(int)*4: 110 for( j = 0; j < size.height*4; j += 4, _src += src_step ) 111 { 112 int t0 = ((int*)_src)[0]; 113 int t1 = ((int*)_src)[1]; 114 ((int*)dst)[j] = t0; 115 ((int*)dst)[j+1] = t1; 116 t0 = ((int*)_src)[2]; 117 t1 = ((int*)_src)[3]; 118 ((int*)dst)[j+2] = t0; 119 ((int*)dst)[j+3] = t1; 120 } 121 break; 122 default: 123 assert(0); 124 return; 125 } 126 } 127 } 128 129 130 template<typename T, typename WT> static void 131 GEMMSingleMul( const T* a_data, size_t a_step, 132 const T* b_data, size_t b_step, 133 const T* c_data, size_t c_step, 134 T* d_data, size_t d_step, 135 Size a_size, Size d_size, 136 double alpha, double beta, int flags ) 137 { 138 int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height; 139 const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data; 140 cv::AutoBuffer<T> _a_buf; 141 T* a_buf = 0; 142 size_t a_step0, a_step1, c_step0, c_step1, t_step; 143 144 a_step /= sizeof(a_data[0]); 145 b_step /= sizeof(b_data[0]); 146 c_step /= sizeof(c_data[0]); 147 d_step /= sizeof(d_data[0]); 148 a_step0 = a_step; 149 a_step1 = 1; 150 151 if( !c_data ) 152 c_step0 = c_step1 = 0; 153 else if( !(flags & GEMM_3_T) ) 154 c_step0 = c_step, c_step1 = 1; 155 else 156 c_step0 = 1, c_step1 = c_step; 157 158 if( flags & GEMM_1_T ) 159 { 160 CV_SWAP( a_step0, a_step1, t_step ); 161 n = a_size.height; 162 if( a_step > 1 && n > 1 ) 163 { 164 _a_buf.allocate(n); 165 a_buf = _a_buf; 166 } 167 } 168 169 if( n == 1 ) /* external product */ 170 { 171 cv::AutoBuffer<T> _b_buf; 172 T* b_buf = 0; 173 174 if( a_step > 1 && a_size.height > 1 ) 175 { 176 _a_buf.allocate(drows); 177 a_buf = _a_buf; 178 for( k = 0; k < drows; k++ ) 179 a_buf[k] = a_data[a_step*k]; 180 a_data = a_buf; 181 } 182 183 if( b_step > 1 ) 184 { 185 _b_buf.allocate(d_size.width); 186 b_buf = _b_buf; 187 for( j = 0; j < d_size.width; 
j++ ) 188 b_buf[j] = b_data[j*b_step]; 189 b_data = b_buf; 190 } 191 192 for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step ) 193 { 194 WT al = WT(a_data[i])*alpha; 195 c_data = _c_data; 196 for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 ) 197 { 198 WT s0 = al*WT(b_data[j]); 199 WT s1 = al*WT(b_data[j+1]); 200 if( !c_data ) 201 { 202 d_data[j] = T(s0); 203 d_data[j+1] = T(s1); 204 } 205 else 206 { 207 d_data[j] = T(s0 + WT(c_data[0])*beta); 208 d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta); 209 } 210 } 211 212 for( ; j < d_size.width; j++, c_data += c_step1 ) 213 { 214 WT s0 = al*WT(b_data[j]); 215 if( !c_data ) 216 d_data[j] = T(s0); 217 else 218 d_data[j] = T(s0 + WT(c_data[0])*beta); 219 } 220 } 221 } 222 else if( flags & GEMM_2_T ) /* A * Bt */ 223 { 224 for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step ) 225 { 226 a_data = _a_data; 227 b_data = _b_data; 228 c_data = _c_data; 229 230 if( a_buf ) 231 { 232 for( k = 0; k < n; k++ ) 233 a_buf[k] = a_data[a_step1*k]; 234 a_data = a_buf; 235 } 236 237 for( j = 0; j < d_size.width; j++, b_data += b_step, 238 c_data += c_step1 ) 239 { 240 WT s0(0), s1(0), s2(0), s3(0); 241 k = 0; 242 #if CV_ENABLE_UNROLLED 243 for( ; k <= n - 4; k += 4 ) 244 { 245 s0 += WT(a_data[k])*WT(b_data[k]); 246 s1 += WT(a_data[k+1])*WT(b_data[k+1]); 247 s2 += WT(a_data[k+2])*WT(b_data[k+2]); 248 s3 += WT(a_data[k+3])*WT(b_data[k+3]); 249 } 250 #endif 251 for( ; k < n; k++ ) 252 s0 += WT(a_data[k])*WT(b_data[k]); 253 s0 = (s0+s1+s2+s3)*alpha; 254 255 if( !c_data ) 256 d_data[j] = T(s0); 257 else 258 d_data[j] = T(s0 + WT(c_data[0])*beta); 259 } 260 } 261 } 262 else if( d_size.width*sizeof(d_data[0]) <= 1600 ) 263 { 264 for( i = 0; i < drows; i++, _a_data += a_step0, 265 _c_data += c_step0, 266 d_data += d_step ) 267 { 268 a_data = _a_data, c_data = _c_data; 269 270 if( a_buf ) 271 { 272 for( k = 0; k < n; k++ ) 273 a_buf[k] = a_data[a_step1*k]; 274 a_data = a_buf; 275 } 276 277 for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 ) 278 { 279 const T* b = _b_data + j; 280 WT s0(0), s1(0), s2(0), s3(0); 281 282 for( k = 0; k < n; k++, b += b_step ) 283 { 284 WT a(a_data[k]); 285 s0 += a * WT(b[0]); s1 += a * WT(b[1]); 286 s2 += a * WT(b[2]); s3 += a * WT(b[3]); 287 } 288 289 if( !c_data ) 290 { 291 d_data[j] = T(s0*alpha); 292 d_data[j+1] = T(s1*alpha); 293 d_data[j+2] = T(s2*alpha); 294 d_data[j+3] = T(s3*alpha); 295 } 296 else 297 { 298 s0 = s0*alpha; s1 = s1*alpha; 299 s2 = s2*alpha; s3 = s3*alpha; 300 d_data[j] = T(s0 + WT(c_data[0])*beta); 301 d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta); 302 d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta); 303 d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta); 304 } 305 } 306 307 for( ; j < m; j++, c_data += c_step1 ) 308 { 309 const T* b = _b_data + j; 310 WT s0(0); 311 312 for( k = 0; k < n; k++, b += b_step ) 313 s0 += WT(a_data[k]) * WT(b[0]); 314 315 s0 = s0*alpha; 316 if( !c_data ) 317 d_data[j] = T(s0); 318 else 319 d_data[j] = T(s0 + WT(c_data[0])*beta); 320 } 321 } 322 } 323 else 324 { 325 cv::AutoBuffer<WT> _d_buf(m); 326 WT* d_buf = _d_buf; 327 328 for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step ) 329 { 330 a_data = _a_data; 331 b_data = _b_data; 332 c_data = _c_data; 333 334 if( a_buf ) 335 { 336 for( k = 0; k < n; k++ ) 337 a_buf[k] = _a_data[a_step1*k]; 338 a_data = a_buf; 339 } 340 341 for( j = 0; j < m; j++ ) 342 d_buf[j] = WT(0); 343 344 for( k = 0; k < n; k++, b_data += b_step ) 345 { 346 WT al(a_data[k]); 347 
j=0; 348 #if CV_ENABLE_UNROLLED 349 for(; j <= m - 4; j += 4 ) 350 { 351 WT t0 = d_buf[j] + WT(b_data[j])*al; 352 WT t1 = d_buf[j+1] + WT(b_data[j+1])*al; 353 d_buf[j] = t0; 354 d_buf[j+1] = t1; 355 t0 = d_buf[j+2] + WT(b_data[j+2])*al; 356 t1 = d_buf[j+3] + WT(b_data[j+3])*al; 357 d_buf[j+2] = t0; 358 d_buf[j+3] = t1; 359 } 360 #endif 361 for( ; j < m; j++ ) 362 d_buf[j] += WT(b_data[j])*al; 363 } 364 365 if( !c_data ) 366 for( j = 0; j < m; j++ ) 367 d_data[j] = T(d_buf[j]*alpha); 368 else 369 for( j = 0; j < m; j++, c_data += c_step1 ) 370 { 371 WT t = d_buf[j]*alpha; 372 d_data[j] = T(t + WT(c_data[0])*beta); 373 } 374 } 375 } 376 } 377 378 379 template<typename T, typename WT> static void 380 GEMMBlockMul( const T* a_data, size_t a_step, 381 const T* b_data, size_t b_step, 382 WT* d_data, size_t d_step, 383 Size a_size, Size d_size, int flags ) 384 { 385 int i, j, k, n = a_size.width, m = d_size.width; 386 const T *_a_data = a_data, *_b_data = b_data; 387 cv::AutoBuffer<T> _a_buf; 388 T* a_buf = 0; 389 size_t a_step0, a_step1, t_step; 390 int do_acc = flags & 16; 391 392 a_step /= sizeof(a_data[0]); 393 b_step /= sizeof(b_data[0]); 394 d_step /= sizeof(d_data[0]); 395 396 a_step0 = a_step; 397 a_step1 = 1; 398 399 if( flags & GEMM_1_T ) 400 { 401 CV_SWAP( a_step0, a_step1, t_step ); 402 n = a_size.height; 403 _a_buf.allocate(n); 404 a_buf = _a_buf; 405 } 406 407 if( flags & GEMM_2_T ) 408 { 409 /* second operand is transposed */ 410 for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step ) 411 { 412 a_data = _a_data; b_data = _b_data; 413 414 if( a_buf ) 415 { 416 for( k = 0; k < n; k++ ) 417 a_buf[k] = a_data[a_step1*k]; 418 a_data = a_buf; 419 } 420 421 for( j = 0; j < d_size.width; j++, b_data += b_step ) 422 { 423 WT s0 = do_acc ? d_data[j]:WT(0), s1(0); 424 for( k = 0; k <= n - 2; k += 2 ) 425 { 426 s0 += WT(a_data[k])*WT(b_data[k]); 427 s1 += WT(a_data[k+1])*WT(b_data[k+1]); 428 } 429 430 for( ; k < n; k++ ) 431 s0 += WT(a_data[k])*WT(b_data[k]); 432 433 d_data[j] = s0 + s1; 434 } 435 } 436 } 437 else 438 { 439 for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step ) 440 { 441 a_data = _a_data, b_data = _b_data; 442 443 if( a_buf ) 444 { 445 for( k = 0; k < n; k++ ) 446 a_buf[k] = a_data[a_step1*k]; 447 a_data = a_buf; 448 } 449 450 for( j = 0; j <= m - 4; j += 4 ) 451 { 452 WT s0, s1, s2, s3; 453 const T* b = b_data + j; 454 455 if( do_acc ) 456 { 457 s0 = d_data[j]; s1 = d_data[j+1]; 458 s2 = d_data[j+2]; s3 = d_data[j+3]; 459 } 460 else 461 s0 = s1 = s2 = s3 = WT(0); 462 463 for( k = 0; k < n; k++, b += b_step ) 464 { 465 WT a(a_data[k]); 466 s0 += a * WT(b[0]); s1 += a * WT(b[1]); 467 s2 += a * WT(b[2]); s3 += a * WT(b[3]); 468 } 469 470 d_data[j] = s0; d_data[j+1] = s1; 471 d_data[j+2] = s2; d_data[j+3] = s3; 472 } 473 474 for( ; j < m; j++ ) 475 { 476 const T* b = b_data + j; 477 WT s0 = do_acc ? 
d_data[j] : WT(0); 478 479 for( k = 0; k < n; k++, b += b_step ) 480 s0 += WT(a_data[k]) * WT(b[0]); 481 482 d_data[j] = s0; 483 } 484 } 485 } 486 } 487 488 489 template<typename T, typename WT> static void 490 GEMMStore( const T* c_data, size_t c_step, 491 const WT* d_buf, size_t d_buf_step, 492 T* d_data, size_t d_step, Size d_size, 493 double alpha, double beta, int flags ) 494 { 495 const T* _c_data = c_data; 496 int j; 497 size_t c_step0, c_step1; 498 499 c_step /= sizeof(c_data[0]); 500 d_buf_step /= sizeof(d_buf[0]); 501 d_step /= sizeof(d_data[0]); 502 503 if( !c_data ) 504 c_step0 = c_step1 = 0; 505 else if( !(flags & GEMM_3_T) ) 506 c_step0 = c_step, c_step1 = 1; 507 else 508 c_step0 = 1, c_step1 = c_step; 509 510 for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step ) 511 { 512 if( _c_data ) 513 { 514 c_data = _c_data; 515 j=0; 516 #if CV_ENABLE_UNROLLED 517 for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 ) 518 { 519 WT t0 = alpha*d_buf[j]; 520 WT t1 = alpha*d_buf[j+1]; 521 t0 += beta*WT(c_data[0]); 522 t1 += beta*WT(c_data[c_step1]); 523 d_data[j] = T(t0); 524 d_data[j+1] = T(t1); 525 t0 = alpha*d_buf[j+2]; 526 t1 = alpha*d_buf[j+3]; 527 t0 += beta*WT(c_data[c_step1*2]); 528 t1 += beta*WT(c_data[c_step1*3]); 529 d_data[j+2] = T(t0); 530 d_data[j+3] = T(t1); 531 } 532 #endif 533 for( ; j < d_size.width; j++, c_data += c_step1 ) 534 { 535 WT t0 = alpha*d_buf[j]; 536 d_data[j] = T(t0 + WT(c_data[0])*beta); 537 } 538 } 539 else 540 { 541 j = 0; 542 #if CV_ENABLE_UNROLLED 543 for( ; j <= d_size.width - 4; j += 4 ) 544 { 545 WT t0 = alpha*d_buf[j]; 546 WT t1 = alpha*d_buf[j+1]; 547 d_data[j] = T(t0); 548 d_data[j+1] = T(t1); 549 t0 = alpha*d_buf[j+2]; 550 t1 = alpha*d_buf[j+3]; 551 d_data[j+2] = T(t0); 552 d_data[j+3] = T(t1); 553 } 554 #endif 555 for( ; j < d_size.width; j++ ) 556 d_data[j] = T(alpha*d_buf[j]); 557 } 558 } 559 } 560 561 562 typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1, 563 const void* src2, size_t step2, const void* src3, size_t step3, 564 void* dst, size_t dststep, Size srcsize, Size dstsize, 565 double alpha, double beta, int flags ); 566 567 typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1, 568 const void* src2, size_t step2, void* dst, size_t dststep, 569 Size srcsize, Size dstsize, int flags ); 570 571 typedef void (*GEMMStoreFunc)( const void* src1, size_t step1, 572 const void* src2, size_t step2, void* dst, size_t dststep, 573 Size dstsize, double alpha, double beta, int flags ); 574 575 static void GEMMSingleMul_32f( const float* a_data, size_t a_step, 576 const float* b_data, size_t b_step, 577 const float* c_data, size_t c_step, 578 float* d_data, size_t d_step, 579 Size a_size, Size d_size, 580 double alpha, double beta, int flags ) 581 { 582 GEMMSingleMul<float,double>(a_data, a_step, b_data, b_step, c_data, 583 c_step, d_data, d_step, a_size, d_size, 584 alpha, beta, flags); 585 } 586 587 static void GEMMSingleMul_64f( const double* a_data, size_t a_step, 588 const double* b_data, size_t b_step, 589 const double* c_data, size_t c_step, 590 double* d_data, size_t d_step, 591 Size a_size, Size d_size, 592 double alpha, double beta, int flags ) 593 { 594 GEMMSingleMul<double,double>(a_data, a_step, b_data, b_step, c_data, 595 c_step, d_data, d_step, a_size, d_size, 596 alpha, beta, flags); 597 } 598 599 600 static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step, 601 const Complexf* b_data, size_t b_step, 602 const Complexf* c_data, size_t c_step, 603 Complexf* d_data, 
size_t d_step, 604 Size a_size, Size d_size, 605 double alpha, double beta, int flags ) 606 { 607 GEMMSingleMul<Complexf,Complexd>(a_data, a_step, b_data, b_step, c_data, 608 c_step, d_data, d_step, a_size, d_size, 609 alpha, beta, flags); 610 } 611 612 static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step, 613 const Complexd* b_data, size_t b_step, 614 const Complexd* c_data, size_t c_step, 615 Complexd* d_data, size_t d_step, 616 Size a_size, Size d_size, 617 double alpha, double beta, int flags ) 618 { 619 GEMMSingleMul<Complexd,Complexd>(a_data, a_step, b_data, b_step, c_data, 620 c_step, d_data, d_step, a_size, d_size, 621 alpha, beta, flags); 622 } 623 624 static void GEMMBlockMul_32f( const float* a_data, size_t a_step, 625 const float* b_data, size_t b_step, 626 double* d_data, size_t d_step, 627 Size a_size, Size d_size, int flags ) 628 { 629 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 630 } 631 632 633 static void GEMMBlockMul_64f( const double* a_data, size_t a_step, 634 const double* b_data, size_t b_step, 635 double* d_data, size_t d_step, 636 Size a_size, Size d_size, int flags ) 637 { 638 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 639 } 640 641 642 static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step, 643 const Complexf* b_data, size_t b_step, 644 Complexd* d_data, size_t d_step, 645 Size a_size, Size d_size, int flags ) 646 { 647 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 648 } 649 650 651 static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step, 652 const Complexd* b_data, size_t b_step, 653 Complexd* d_data, size_t d_step, 654 Size a_size, Size d_size, int flags ) 655 { 656 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 657 } 658 659 660 static void GEMMStore_32f( const float* c_data, size_t c_step, 661 const double* d_buf, size_t d_buf_step, 662 float* d_data, size_t d_step, Size d_size, 663 double alpha, double beta, int flags ) 664 { 665 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 666 } 667 668 669 static void GEMMStore_64f( const double* c_data, size_t c_step, 670 const double* d_buf, size_t d_buf_step, 671 double* d_data, size_t d_step, Size d_size, 672 double alpha, double beta, int flags ) 673 { 674 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 675 } 676 677 678 static void GEMMStore_32fc( const Complexf* c_data, size_t c_step, 679 const Complexd* d_buf, size_t d_buf_step, 680 Complexf* d_data, size_t d_step, Size d_size, 681 double alpha, double beta, int flags ) 682 { 683 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 684 } 685 686 687 static void GEMMStore_64fc( const Complexd* c_data, size_t c_step, 688 const Complexd* d_buf, size_t d_buf_step, 689 Complexd* d_data, size_t d_step, Size d_size, 690 double alpha, double beta, int flags ) 691 { 692 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 693 } 694 695 #ifdef HAVE_CLAMDBLAS 696 697 static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, 698 InputArray matC, double beta, OutputArray matD, int flags ) 699 { 700 int type = matA.type(), esz = CV_ELEM_SIZE(type); 701 bool haveC = matC.kind() != cv::_InputArray::NONE; 702 Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? 
matC.size() : Size(0, 0); 703 bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; 704 705 if (atrans) 706 sizeA = Size(sizeA.height, sizeA.width); 707 if (btrans) 708 sizeB = Size(sizeB.height, sizeB.width); 709 if (haveC && ctrans) 710 sizeC = Size(sizeC.height, sizeC.width); 711 712 Size sizeD(sizeB.width, sizeA.height); 713 714 CV_Assert( matB.type() == type && (!haveC || matC.type() == type) ); 715 CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); 716 717 matD.create(sizeD, type); 718 if ( matA.offset() % esz != 0 || matA.step() % esz != 0 || 719 matB.offset() % esz != 0 || matB.step() % esz != 0 || 720 (haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) ) 721 return false; 722 723 UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); 724 if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D)) 725 { 726 return false; 727 } 728 if (haveC) 729 { 730 UMat C = matC.getUMat(); 731 if (!ocl::internal::isCLBuffer(C)) 732 return false; 733 } 734 if (haveC) 735 ctrans ? transpose(matC, D) : matC.copyTo(D); 736 else 737 D.setTo(Scalar::all(0)); 738 739 int M = sizeD.height, N = sizeD.width, K = sizeA.width; 740 int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz; 741 int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz; 742 743 cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr(); 744 clAmdBlasTranspose transA = atrans ? clAmdBlasTrans : clAmdBlasNoTrans; 745 clAmdBlasTranspose transB = btrans ? clAmdBlasTrans : clAmdBlasNoTrans; 746 clAmdBlasOrder order = clAmdBlasRowMajor; 747 clAmdBlasStatus status = clAmdBlasSuccess; 748 749 if (type == CV_32FC1) 750 status = clAmdBlasSgemmEx(order, transA, transB, M, N, K, 751 (cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 752 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 753 (cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 754 1, &clq, 0, NULL, NULL); 755 else if (type == CV_64FC1) 756 status = clAmdBlasDgemmEx(order, transA, transB, M, N, K, 757 alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 758 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 759 beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 760 1, &clq, 0, NULL, NULL); 761 else if (type == CV_32FC2) 762 { 763 cl_float2 alpha_2 = { { (cl_float)alpha, 0 } }; 764 cl_float2 beta_2 = { { (cl_float)beta, 0 } }; 765 status = clAmdBlasCgemmEx(order, transA, transB, M, N, K, 766 alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 767 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 768 beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 769 1, &clq, 0, NULL, NULL); 770 } 771 else if (type == CV_64FC2) 772 { 773 cl_double2 alpha_2 = { { alpha, 0 } }; 774 cl_double2 beta_2 = { { beta, 0 } }; 775 status = clAmdBlasZgemmEx(order, transA, transB, M, N, K, 776 alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 777 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 778 beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 779 1, &clq, 0, NULL, NULL); 780 } 781 else 782 CV_Error(Error::StsUnsupportedFormat, ""); 783 784 return status == clAmdBlasSuccess; 785 } 786 787 #endif 788 789 #ifdef HAVE_OPENCL 790 791 static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, 792 InputArray matC, double beta, OutputArray matD, int flags ) 793 { 794 int depth = matA.depth(), cn = matA.channels(); 795 int type = CV_MAKETYPE(depth, cn); 796 797 CV_Assert( type == 
matB.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); 798 799 const ocl::Device & dev = ocl::Device::getDefault(); 800 bool doubleSupport = dev.doubleFPConfig() > 0; 801 802 if (!doubleSupport && depth == CV_64F) 803 return false; 804 805 bool haveC = matC.kind() != cv::_InputArray::NONE; 806 Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); 807 bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; 808 809 if (atrans) 810 sizeA = Size(sizeA.height, sizeA.width); 811 if (btrans) 812 sizeB = Size(sizeB.height, sizeB.width); 813 if (haveC && ctrans) 814 sizeC = Size(sizeC.height, sizeC.width); 815 816 Size sizeD(sizeB.width, sizeA.height); 817 818 CV_Assert( !haveC || matC.type() == type ); 819 CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); 820 821 int max_wg_size = (int)dev.maxWorkGroupSize(); 822 int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; 823 824 matD.create(sizeD, type); 825 826 UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); 827 828 if (atrans) 829 A = A.t(); 830 831 if (btrans) 832 B = B.t(); 833 834 if (haveC) 835 ctrans ? transpose(matC, D) : matC.copyTo(D); 836 837 int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; 838 int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); 839 840 String opts = format("-D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s", 841 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), 842 cn, kercn, block_size, 843 (sizeA.width % block_size !=0) ? "-D NO_MULT" : "", 844 haveC ? "-D HAVE_C" : "", 845 doubleSupport ? " -D DOUBLE_SUPPORT" : ""); 846 847 ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); 848 if (k.empty()) 849 return false; 850 851 if (depth == CV_64F) 852 k.args(ocl::KernelArg::ReadOnlyNoSize(A), 853 ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), 854 ocl::KernelArg::ReadWrite(D, cn, kercn), 855 sizeA.width, alpha, beta); 856 else 857 k.args(ocl::KernelArg::ReadOnlyNoSize(A), 858 ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), 859 ocl::KernelArg::ReadWrite(D, cn, kercn), 860 sizeA.width, (float)alpha, (float)beta); 861 862 size_t globalsize[2] = { sizeD.width * cn / kercn, sizeD.height}; 863 size_t localsize[2] = { block_size, block_size}; 864 return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); 865 } 866 #endif 867 } 868 869 void cv::gemm( InputArray matA, InputArray matB, double alpha, 870 InputArray matC, double beta, OutputArray _matD, int flags ) 871 { 872 #ifdef HAVE_CLAMDBLAS 873 CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && 874 matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes 875 ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags)) 876 #endif 877 878 #ifdef HAVE_OPENCL 879 CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2, 880 ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags)) 881 #endif 882 883 const int block_lin_size = 128; 884 const int block_size = block_lin_size * block_lin_size; 885 886 static double zero[] = {0,0,0,0}; 887 static float zerof[] = {0,0,0,0}; 888 889 Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0 ? 
matC.getMat() : Mat(); 890 Size a_size = A.size(), d_size; 891 int i, len = 0, type = A.type(); 892 893 CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); 894 895 switch( flags & (GEMM_1_T|GEMM_2_T) ) 896 { 897 case 0: 898 d_size = Size( B.cols, a_size.height ); 899 len = B.rows; 900 CV_Assert( a_size.width == len ); 901 break; 902 case 1: 903 d_size = Size( B.cols, a_size.width ); 904 len = B.rows; 905 CV_Assert( a_size.height == len ); 906 break; 907 case 2: 908 d_size = Size( B.rows, a_size.height ); 909 len = B.cols; 910 CV_Assert( a_size.width == len ); 911 break; 912 case 3: 913 d_size = Size( B.rows, a_size.width ); 914 len = B.cols; 915 CV_Assert( a_size.height == len ); 916 break; 917 } 918 919 if( !C.empty() ) 920 { 921 CV_Assert( C.type() == type && 922 (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) || 923 ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height))); 924 } 925 926 _matD.create( d_size.height, d_size.width, type ); 927 Mat D = _matD.getMat(); 928 if( (flags & GEMM_3_T) != 0 && C.data == D.data ) 929 { 930 transpose( C, C ); 931 flags &= ~GEMM_3_T; 932 } 933 934 if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) ) 935 { 936 if( type == CV_32F ) 937 { 938 float* d = D.ptr<float>(); 939 const float *a = A.ptr<float>(), 940 *b = B.ptr<float>(), 941 *c = (const float*)C.data; 942 size_t d_step = D.step/sizeof(d[0]), 943 a_step = A.step/sizeof(a[0]), 944 b_step = B.step/sizeof(b[0]), 945 c_step = C.data ? C.step/sizeof(c[0]) : 0; 946 947 if( !c ) 948 c = zerof; 949 950 switch( len ) 951 { 952 case 2: 953 if( len == d_size.width && b != d ) 954 { 955 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 956 { 957 float t0 = a[0]*b[0] + a[1]*b[b_step]; 958 float t1 = a[0]*b[1] + a[1]*b[b_step+1]; 959 d[0] = (float)(t0*alpha + c[0]*beta); 960 d[1] = (float)(t1*alpha + c[1]*beta); 961 } 962 } 963 else if( a != d ) 964 { 965 int c_step0 = 1; 966 if( c == zerof ) 967 { 968 c_step0 = 0; 969 c_step = 1; 970 } 971 972 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 973 { 974 float t0 = a[0]*b[0] + a[1]*b[b_step]; 975 float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step]; 976 d[0] = (float)(t0*alpha + c[0]*beta); 977 d[d_step] = (float)(t1*alpha + c[c_step]*beta); 978 } 979 } 980 else 981 break; 982 return; 983 case 3: 984 if( len == d_size.width && b != d ) 985 { 986 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 987 { 988 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 989 float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1]; 990 float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2]; 991 d[0] = (float)(t0*alpha + c[0]*beta); 992 d[1] = (float)(t1*alpha + c[1]*beta); 993 d[2] = (float)(t2*alpha + c[2]*beta); 994 } 995 } 996 else if( a != d ) 997 { 998 int c_step0 = 1; 999 if( c == zerof ) 1000 { 1001 c_step0 = 0; 1002 c_step = 1; 1003 } 1004 1005 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 1006 { 1007 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 1008 float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2]; 1009 float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2]; 1010 1011 d[0] = (float)(t0*alpha + c[0]*beta); 1012 d[d_step] = (float)(t1*alpha + c[c_step]*beta); 1013 d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta); 1014 } 1015 } 1016 else 1017 break; 1018 return; 1019 
case 4: 1020 if( len == d_size.width && b != d ) 1021 { 1022 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 1023 { 1024 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 1025 float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1]; 1026 float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2]; 1027 float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3]; 1028 d[0] = (float)(t0*alpha + c[0]*beta); 1029 d[1] = (float)(t1*alpha + c[1]*beta); 1030 d[2] = (float)(t2*alpha + c[2]*beta); 1031 d[3] = (float)(t3*alpha + c[3]*beta); 1032 } 1033 } 1034 else if( len <= 16 && a != d ) 1035 { 1036 int c_step0 = 1; 1037 if( c == zerof ) 1038 { 1039 c_step0 = 0; 1040 c_step = 1; 1041 } 1042 1043 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 1044 { 1045 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 1046 float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + 1047 a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3]; 1048 float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + 1049 a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3]; 1050 float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] + 1051 a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3]; 1052 d[0] = (float)(t0*alpha + c[0]*beta); 1053 d[d_step] = (float)(t1*alpha + c[c_step]*beta); 1054 d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta); 1055 d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta); 1056 } 1057 } 1058 else 1059 break; 1060 return; 1061 } 1062 } 1063 1064 if( type == CV_64F ) 1065 { 1066 double* d = D.ptr<double>(); 1067 const double *a = A.ptr<double>(), 1068 *b = B.ptr<double>(), 1069 *c = (const double*)C.data; 1070 size_t d_step = D.step/sizeof(d[0]), 1071 a_step = A.step/sizeof(a[0]), 1072 b_step = B.step/sizeof(b[0]), 1073 c_step = C.data ? 
C.step/sizeof(c[0]) : 0; 1074 if( !c ) 1075 c = zero; 1076 1077 switch( len ) 1078 { 1079 case 2: 1080 if( len == d_size.width && b != d ) 1081 { 1082 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 1083 { 1084 double t0 = a[0]*b[0] + a[1]*b[b_step]; 1085 double t1 = a[0]*b[1] + a[1]*b[b_step+1]; 1086 d[0] = t0*alpha + c[0]*beta; 1087 d[1] = t1*alpha + c[1]*beta; 1088 } 1089 } 1090 else if( a != d ) 1091 { 1092 int c_step0 = 1; 1093 if( c == zero ) 1094 { 1095 c_step0 = 0; 1096 c_step = 1; 1097 } 1098 1099 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 1100 { 1101 double t0 = a[0]*b[0] + a[1]*b[b_step]; 1102 double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step]; 1103 d[0] = t0*alpha + c[0]*beta; 1104 d[d_step] = t1*alpha + c[c_step]*beta; 1105 } 1106 } 1107 else 1108 break; 1109 return; 1110 case 3: 1111 if( len == d_size.width && b != d ) 1112 { 1113 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 1114 { 1115 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 1116 double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1]; 1117 double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2]; 1118 d[0] = t0*alpha + c[0]*beta; 1119 d[1] = t1*alpha + c[1]*beta; 1120 d[2] = t2*alpha + c[2]*beta; 1121 } 1122 } 1123 else if( a != d ) 1124 { 1125 int c_step0 = 1; 1126 if( c == zero ) 1127 { 1128 c_step0 = 0; 1129 c_step = 1; 1130 } 1131 1132 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 1133 { 1134 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 1135 double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2]; 1136 double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2]; 1137 1138 d[0] = t0*alpha + c[0]*beta; 1139 d[d_step] = t1*alpha + c[c_step]*beta; 1140 d[d_step*2] = t2*alpha + c[c_step*2]*beta; 1141 } 1142 } 1143 else 1144 break; 1145 return; 1146 case 4: 1147 if( len == d_size.width && b != d ) 1148 { 1149 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 1150 { 1151 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 1152 double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1]; 1153 double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2]; 1154 double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3]; 1155 d[0] = t0*alpha + c[0]*beta; 1156 d[1] = t1*alpha + c[1]*beta; 1157 d[2] = t2*alpha + c[2]*beta; 1158 d[3] = t3*alpha + c[3]*beta; 1159 } 1160 } 1161 else if( d_size.width <= 16 && a != d ) 1162 { 1163 int c_step0 = 1; 1164 if( c == zero ) 1165 { 1166 c_step0 = 0; 1167 c_step = 1; 1168 } 1169 1170 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 1171 { 1172 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 1173 double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + 1174 a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3]; 1175 double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + 1176 a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3]; 1177 double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] + 1178 a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3]; 1179 d[0] = t0*alpha + c[0]*beta; 1180 d[d_step] = t1*alpha + c[c_step]*beta; 1181 d[d_step*2] = t2*alpha + c[c_step*2]*beta; 1182 d[d_step*3] = t3*alpha + c[c_step*3]*beta; 1183 } 1184 } 1185 else 1186 break; 1187 return; 1188 } 1189 } 1190 } 1191 1192 { 1193 size_t b_step = B.step; 1194 GEMMSingleMulFunc singleMulFunc; 1195 
GEMMBlockMulFunc blockMulFunc; 1196 GEMMStoreFunc storeFunc; 1197 Mat *matD = &D, tmat; 1198 size_t tmat_size = 0; 1199 const uchar* Cdata = C.data; 1200 size_t Cstep = C.data ? (size_t)C.step : 0; 1201 AutoBuffer<uchar> buf; 1202 1203 if( type == CV_32FC1 ) 1204 { 1205 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f; 1206 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f; 1207 storeFunc = (GEMMStoreFunc)GEMMStore_32f; 1208 } 1209 else if( type == CV_64FC1 ) 1210 { 1211 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f; 1212 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f; 1213 storeFunc = (GEMMStoreFunc)GEMMStore_64f; 1214 } 1215 else if( type == CV_32FC2 ) 1216 { 1217 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc; 1218 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc; 1219 storeFunc = (GEMMStoreFunc)GEMMStore_32fc; 1220 } 1221 else 1222 { 1223 CV_Assert( type == CV_64FC2 ); 1224 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc; 1225 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc; 1226 storeFunc = (GEMMStoreFunc)GEMMStore_64fc; 1227 } 1228 1229 if( D.data == A.data || D.data == B.data ) 1230 { 1231 tmat_size = (size_t)d_size.width*d_size.height*CV_ELEM_SIZE(type); 1232 // Allocate tmat later, once the size of buf is known 1233 matD = &tmat; 1234 } 1235 1236 if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() ) 1237 { 1238 b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type); 1239 flags |= GEMM_2_T; 1240 } 1241 1242 /*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 ) 1243 { 1244 blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p : 1245 type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p : 1246 type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p : 1247 type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0; 1248 } 1249 1250 if( blas_func ) 1251 { 1252 const char* transa = flags & GEMM_1_T ? "t" : "n"; 1253 const char* transb = flags & GEMM_2_T ? "t" : "n"; 1254 int lda, ldb, ldd; 1255 1256 if( C->data.ptr ) 1257 { 1258 if( C->data.ptr != D->data.ptr ) 1259 { 1260 if( !(flags & GEMM_3_T) ) 1261 cvCopy( C, D ); 1262 else 1263 cvTranspose( C, D ); 1264 } 1265 } 1266 1267 if( CV_MAT_DEPTH(type) == CV_32F ) 1268 { 1269 Complex32f _alpha, _beta; 1270 1271 lda = A->step/sizeof(float); 1272 ldb = b_step/sizeof(float); 1273 ldd = D->step/sizeof(float); 1274 _alpha.re = (float)alpha; 1275 _alpha.im = 0; 1276 _beta.re = C->data.ptr ? (float)beta : 0; 1277 _beta.im = 0; 1278 if( CV_MAT_CN(type) == 2 ) 1279 lda /= 2, ldb /= 2, ldd /= 2; 1280 1281 blas_func( transb, transa, &d_size.width, &d_size.height, &len, 1282 &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda, 1283 &_beta, D->data.ptr, &ldd ); 1284 } 1285 else 1286 { 1287 CvComplex64f _alpha, _beta; 1288 1289 lda = A->step/sizeof(double); 1290 ldb = b_step/sizeof(double); 1291 ldd = D->step/sizeof(double); 1292 _alpha.re = alpha; 1293 _alpha.im = 0; 1294 _beta.re = C->data.ptr ? 
beta : 0; 1295 _beta.im = 0; 1296 if( CV_MAT_CN(type) == 2 ) 1297 lda /= 2, ldb /= 2, ldd /= 2; 1298 1299 blas_func( transb, transa, &d_size.width, &d_size.height, &len, 1300 &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda, 1301 &_beta, D->data.ptr, &ldd ); 1302 } 1303 } 1304 else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) && 1305 len <= 10000) || len <= 10 || 1306 (d_size.width <= block_lin_size && 1307 d_size.height <= block_lin_size && len <= block_lin_size) ) 1308 { 1309 if( tmat_size > 0 ) { 1310 buf.allocate(tmat_size); 1311 tmat = Mat(d_size.height, d_size.width, type, (uchar*)buf ); 1312 } 1313 singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep, 1314 matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags ); 1315 } 1316 else 1317 { 1318 int is_a_t = flags & GEMM_1_T; 1319 int is_b_t = flags & GEMM_2_T; 1320 int elem_size = CV_ELEM_SIZE(type); 1321 int dk0_1, dk0_2; 1322 size_t a_buf_size = 0, b_buf_size, d_buf_size; 1323 uchar* a_buf = 0; 1324 uchar* b_buf = 0; 1325 uchar* d_buf = 0; 1326 int j, k, di = 0, dj = 0, dk = 0; 1327 int dm0, dn0, dk0; 1328 size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1; 1329 int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0); 1330 1331 if( !is_a_t ) 1332 a_step0 = A.step, a_step1 = elem_size; 1333 else 1334 a_step0 = elem_size, a_step1 = A.step; 1335 1336 if( !is_b_t ) 1337 b_step0 = b_step, b_step1 = elem_size; 1338 else 1339 b_step0 = elem_size, b_step1 = b_step; 1340 1341 if( C.empty() ) 1342 { 1343 c_step0 = c_step1 = 0; 1344 flags &= ~GEMM_3_T; 1345 } 1346 else if( !(flags & GEMM_3_T) ) 1347 c_step0 = C.step, c_step1 = elem_size; 1348 else 1349 c_step0 = elem_size, c_step1 = C.step; 1350 1351 dm0 = std::min( block_lin_size, d_size.height ); 1352 dn0 = std::min( block_lin_size, d_size.width ); 1353 dk0_1 = block_size / dm0; 1354 dk0_2 = block_size / dn0; 1355 dk0 = std::min( dk0_1, dk0_2 ); 1356 dk0 = std::min( dk0, len ); 1357 if( dk0*dm0 > block_size ) 1358 dm0 = block_size / dk0; 1359 if( dk0*dn0 > block_size ) 1360 dn0 = block_size / dk0; 1361 1362 dk0_1 = (dn0+dn0/8+2) & -2; 1363 b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size; 1364 d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size; 1365 1366 if( is_a_t ) 1367 { 1368 a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; 1369 flags &= ~GEMM_1_T; 1370 } 1371 1372 buf.allocate(d_buf_size + b_buf_size + a_buf_size + tmat_size); 1373 d_buf = (uchar*)buf; 1374 b_buf = d_buf + d_buf_size; 1375 1376 if( is_a_t ) 1377 a_buf = b_buf + b_buf_size; 1378 if( tmat_size > 0 ) 1379 tmat = Mat(d_size.height, d_size.width, type, b_buf + b_buf_size + a_buf_size ); 1380 1381 for( i = 0; i < d_size.height; i += di ) 1382 { 1383 di = dm0; 1384 if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height ) 1385 di = d_size.height - i; 1386 1387 for( j = 0; j < d_size.width; j += dj ) 1388 { 1389 uchar* _d = matD->ptr() + i*matD->step + j*elem_size; 1390 const uchar* _c = Cdata + i*c_step0 + j*c_step1; 1391 size_t _d_step = matD->step; 1392 dj = dn0; 1393 1394 if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width ) 1395 dj = d_size.width - j; 1396 1397 flags &= 15; 1398 if( dk0 < len ) 1399 { 1400 _d = d_buf; 1401 _d_step = dj*work_elem_size; 1402 } 1403 1404 for( k = 0; k < len; k += dk ) 1405 { 1406 const uchar* _a = A.ptr() + i*a_step0 + k*a_step1; 1407 size_t _a_step = A.step; 1408 const uchar* _b = B.ptr() + k*b_step0 + j*b_step1; 1409 size_t _b_step = b_step; 1410 Size a_bl_size; 1411 1412 dk = 
dk0; 1413 if( k + dk >= len || 8*(k + dk) + dk > 8*len ) 1414 dk = len - k; 1415 1416 if( !is_a_t ) 1417 a_bl_size.width = dk, a_bl_size.height = di; 1418 else 1419 a_bl_size.width = di, a_bl_size.height = dk; 1420 1421 if( a_buf && is_a_t ) 1422 { 1423 _a_step = dk*elem_size; 1424 GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size ); 1425 std::swap( a_bl_size.width, a_bl_size.height ); 1426 _a = a_buf; 1427 } 1428 1429 if( dj < d_size.width ) 1430 { 1431 Size b_size; 1432 if( !is_b_t ) 1433 b_size.width = dj, b_size.height = dk; 1434 else 1435 b_size.width = dk, b_size.height = dj; 1436 1437 _b_step = b_size.width*elem_size; 1438 GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size ); 1439 _b = b_buf; 1440 } 1441 1442 if( dk0 < len ) 1443 blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step, 1444 a_bl_size, Size(dj,di), flags ); 1445 else 1446 singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep, 1447 _d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags ); 1448 flags |= 16; 1449 } 1450 1451 if( dk0 < len ) 1452 storeFunc( _c, Cstep, _d, _d_step, 1453 matD->ptr(i) + j*elem_size, 1454 matD->step, Size(dj,di), alpha, beta, flags ); 1455 } 1456 } 1457 } 1458 1459 if( matD != &D ) 1460 matD->copyTo(D); 1461 } 1462 } 1463 1464 /****************************************************************************************\ 1465 * Transform * 1466 \****************************************************************************************/ 1467 1468 namespace cv 1469 { 1470 1471 template<typename T, typename WT> static void 1472 transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn ) 1473 { 1474 int x; 1475 1476 if( scn == 2 && dcn == 2 ) 1477 { 1478 for( x = 0; x < len*2; x += 2 ) 1479 { 1480 WT v0 = src[x], v1 = src[x+1]; 1481 T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]); 1482 T t1 = saturate_cast<T>(m[3]*v0 + m[4]*v1 + m[5]); 1483 dst[x] = t0; dst[x+1] = t1; 1484 } 1485 } 1486 else if( scn == 3 && dcn == 3 ) 1487 { 1488 for( x = 0; x < len*3; x += 3 ) 1489 { 1490 WT v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 1491 T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); 1492 T t1 = saturate_cast<T>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); 1493 T t2 = saturate_cast<T>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); 1494 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 1495 } 1496 } 1497 else if( scn == 3 && dcn == 1 ) 1498 { 1499 for( x = 0; x < len; x++, src += 3 ) 1500 dst[x] = saturate_cast<T>(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]); 1501 } 1502 else if( scn == 4 && dcn == 4 ) 1503 { 1504 for( x = 0; x < len*4; x += 4 ) 1505 { 1506 WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3]; 1507 T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]); 1508 T t1 = saturate_cast<T>(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]); 1509 dst[x] = t0; dst[x+1] = t1; 1510 t0 = saturate_cast<T>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]); 1511 t1 = saturate_cast<T>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]); 1512 dst[x+2] = t0; dst[x+3] = t1; 1513 } 1514 } 1515 else 1516 { 1517 for( x = 0; x < len; x++, src += scn, dst += dcn ) 1518 { 1519 const WT* _m = m; 1520 int j, k; 1521 for( j = 0; j < dcn; j++, _m += scn + 1 ) 1522 { 1523 WT s = _m[scn]; 1524 for( k = 0; k < scn; k++ ) 1525 s += _m[k]*src[k]; 1526 dst[j] = saturate_cast<T>(s); 1527 } 1528 } 1529 } 1530 } 1531 1532 #if CV_SSE2 1533 1534 static inline void 1535 load3x3Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3 ) 1536 { 1537 m0 = _mm_setr_ps(m[0], m[4], 
m[8], 0); 1538 m1 = _mm_setr_ps(m[1], m[5], m[9], 0); 1539 m2 = _mm_setr_ps(m[2], m[6], m[10], 0); 1540 m3 = _mm_setr_ps(m[3], m[7], m[11], 0); 1541 } 1542 1543 static inline void 1544 load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, __m128& m4 ) 1545 { 1546 m0 = _mm_setr_ps(m[0], m[5], m[10], m[15]); 1547 m1 = _mm_setr_ps(m[1], m[6], m[11], m[16]); 1548 m2 = _mm_setr_ps(m[2], m[7], m[12], m[17]); 1549 m3 = _mm_setr_ps(m[3], m[8], m[13], m[18]); 1550 m4 = _mm_setr_ps(m[4], m[9], m[14], m[19]); 1551 } 1552 1553 #endif 1554 1555 static void 1556 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn ) 1557 { 1558 #if CV_SSE2 1559 const int BITS = 10, SCALE = 1 << BITS; 1560 const float MAX_M = (float)(1 << (15 - BITS)); 1561 1562 if( USE_SSE2 && scn == 3 && dcn == 3 && 1563 std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 && 1564 std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 && 1565 std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 ) 1566 { 1567 // faster fixed-point transformation 1568 short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE), 1569 m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE), 1570 m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE), 1571 m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE), 1572 m22 = saturate_cast<short>(m[10]*SCALE); 1573 int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ), 1574 m23 = saturate_cast<int>((m[11]+0.5f)*SCALE); 1575 1576 __m128i m0 = _mm_setr_epi16(0, m00, m01, m02, m00, m01, m02, 0); 1577 __m128i m1 = _mm_setr_epi16(0, m10, m11, m12, m10, m11, m12, 0); 1578 __m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0); 1579 __m128i m3 = _mm_setr_epi32(m03, m13, m23, 0); 1580 int x = 0; 1581 1582 for( ; x <= (len - 8)*3; x += 8*3 ) 1583 { 1584 __m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1; 1585 __m128i v0 = _mm_loadl_epi64((const __m128i*)(src + x)); 1586 __m128i v1 = _mm_loadl_epi64((const __m128i*)(src + x + 8)); 1587 __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 16)), v3; 1588 v0 = _mm_unpacklo_epi8(v0, z); // b0 g0 r0 b1 g1 r1 b2 g2 1589 v1 = _mm_unpacklo_epi8(v1, z); // r2 b3 g3 r3 b4 g4 r4 b5 1590 v2 = _mm_unpacklo_epi8(v2, z); // g5 r5 b6 g6 r6 b7 g7 r7 1591 1592 v3 = _mm_srli_si128(v2, 2); // ? b6 g6 r6 b7 g7 r7 0 1593 v2 = _mm_or_si128(_mm_slli_si128(v2, 10), _mm_srli_si128(v1, 6)); // ? b4 g4 r4 b5 g5 r5 ? 1594 v1 = _mm_or_si128(_mm_slli_si128(v1, 6), _mm_srli_si128(v0, 10)); // ? b2 g2 r2 b3 g3 r3 ? 1595 v0 = _mm_slli_si128(v0, 2); // 0 b0 g0 r0 b1 g1 r1 ? 
1596 1597 // process pixels 0 & 1 1598 t0 = _mm_madd_epi16(v0, m0); // a0 b0 a1 b1 1599 t1 = _mm_madd_epi16(v0, m1); // c0 d0 c1 d1 1600 t2 = _mm_madd_epi16(v0, m2); // e0 f0 e1 f1 1601 v0 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 1602 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 1603 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 1604 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 1605 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v0, t1), _mm_unpackhi_epi64(v0,t1)), m3); // B0 G0 R0 0 1606 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B1 G1 R1 0 1607 r0 = _mm_srai_epi32(r0, BITS); 1608 r1 = _mm_srai_epi32(r1, BITS); 1609 v0 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B0 G0 R0 B1 G1 R1 0 1610 1611 // process pixels 2 & 3 1612 t0 = _mm_madd_epi16(v1, m0); // a0 b0 a1 b1 1613 t1 = _mm_madd_epi16(v1, m1); // c0 d0 c1 d1 1614 t2 = _mm_madd_epi16(v1, m2); // e0 f0 e1 f1 1615 v1 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 1616 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 1617 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 1618 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 1619 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v1, t1), _mm_unpackhi_epi64(v1,t1)), m3); // B2 G2 R2 0 1620 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B3 G3 R3 0 1621 r0 = _mm_srai_epi32(r0, BITS); 1622 r1 = _mm_srai_epi32(r1, BITS); 1623 v1 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B2 G2 R2 B3 G3 R3 0 1624 1625 // process pixels 4 & 5 1626 t0 = _mm_madd_epi16(v2, m0); // a0 b0 a1 b1 1627 t1 = _mm_madd_epi16(v2, m1); // c0 d0 c1 d1 1628 t2 = _mm_madd_epi16(v2, m2); // e0 f0 e1 f1 1629 v2 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 1630 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 1631 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 1632 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 1633 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v2, t1), _mm_unpackhi_epi64(v2,t1)), m3); // B4 G4 R4 0 1634 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B5 G5 R5 0 1635 r0 = _mm_srai_epi32(r0, BITS); 1636 r1 = _mm_srai_epi32(r1, BITS); 1637 v2 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B4 G4 R4 B5 G5 R5 0 1638 1639 // process pixels 6 & 7 1640 t0 = _mm_madd_epi16(v3, m0); // a0 b0 a1 b1 1641 t1 = _mm_madd_epi16(v3, m1); // c0 d0 c1 d1 1642 t2 = _mm_madd_epi16(v3, m2); // e0 f0 e1 f1 1643 v3 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 1644 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 1645 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 1646 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 1647 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v3, t1), _mm_unpackhi_epi64(v3,t1)), m3); // B6 G6 R6 0 1648 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B7 G7 R7 0 1649 r0 = _mm_srai_epi32(r0, BITS); 1650 r1 = _mm_srai_epi32(r1, BITS); 1651 v3 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B6 G6 R6 B7 G7 R7 0 1652 1653 v0 = _mm_or_si128(_mm_srli_si128(v0, 1), _mm_slli_si128(v1, 5)); 1654 v1 = _mm_or_si128(_mm_srli_si128(v1, 3), _mm_slli_si128(v2, 3)); 1655 v2 = _mm_or_si128(_mm_srli_si128(v2, 5), _mm_slli_si128(v3, 1)); 1656 _mm_storel_epi64((__m128i*)(dst + x), v0); 1657 _mm_storel_epi64((__m128i*)(dst + x + 8), v1); 1658 _mm_storel_epi64((__m128i*)(dst + x + 16), v2); 1659 } 1660 1661 for( ; x < len*3; x += 3 ) 
1662 { 1663 int v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 1664 uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS); 1665 uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS); 1666 uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS); 1667 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 1668 } 1669 return; 1670 } 1671 #endif 1672 1673 transform_(src, dst, m, len, scn, dcn); 1674 } 1675 1676 static void 1677 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) 1678 { 1679 #if CV_SSE2 1680 if( USE_SSE2 && scn == 3 && dcn == 3 ) 1681 { 1682 __m128 m0, m1, m2, m3; 1683 __m128i delta = _mm_setr_epi16(0,-32768,-32768,-32768,-32768,-32768,-32768,0); 1684 load3x3Matrix(m, m0, m1, m2, m3); 1685 m3 = _mm_sub_ps(m3, _mm_setr_ps(32768.f, 32768.f, 32768.f, 0.f)); 1686 1687 int x = 0; 1688 for( ; x <= (len - 4)*3; x += 4*3 ) 1689 { 1690 __m128i z = _mm_setzero_si128(); 1691 __m128i v0 = _mm_loadu_si128((const __m128i*)(src + x)), v1; 1692 __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 8)), v3; 1693 v1 = _mm_unpacklo_epi16(_mm_srli_si128(v0, 6), z); // b1 g1 r1 1694 v3 = _mm_unpacklo_epi16(_mm_srli_si128(v2, 2), z); // b3 g3 r3 1695 v2 = _mm_or_si128(_mm_srli_si128(v0, 12), _mm_slli_si128(v2, 4)); 1696 v0 = _mm_unpacklo_epi16(v0, z); // b0 g0 r0 1697 v2 = _mm_unpacklo_epi16(v2, z); // b2 g2 r2 1698 __m128 x0 = _mm_cvtepi32_ps(v0), x1 = _mm_cvtepi32_ps(v1); 1699 __m128 x2 = _mm_cvtepi32_ps(v2), x3 = _mm_cvtepi32_ps(v3); 1700 __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 1701 _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))), 1702 _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))), 1703 _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3); 1704 __m128 y1 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 1705 _mm_mul_ps(m0, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(0,0,0,0))), 1706 _mm_mul_ps(m1, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(1,1,1,1)))), 1707 _mm_mul_ps(m2, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(2,2,2,2)))), m3); 1708 __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 1709 _mm_mul_ps(m0, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(0,0,0,0))), 1710 _mm_mul_ps(m1, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(1,1,1,1)))), 1711 _mm_mul_ps(m2, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(2,2,2,2)))), m3); 1712 __m128 y3 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 1713 _mm_mul_ps(m0, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(0,0,0,0))), 1714 _mm_mul_ps(m1, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(1,1,1,1)))), 1715 _mm_mul_ps(m2, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(2,2,2,2)))), m3); 1716 v0 = _mm_cvtps_epi32(y0); v1 = _mm_cvtps_epi32(y1); 1717 v2 = _mm_cvtps_epi32(y2); v3 = _mm_cvtps_epi32(y3); 1718 1719 v0 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v0,4), v1), delta); // 0 b0 g0 r0 b1 g1 r1 0 1720 v2 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v2,4), v3), delta); // 0 b2 g2 r2 b3 g3 r3 0 1721 v1 = _mm_or_si128(_mm_srli_si128(v0,2), _mm_slli_si128(v2,10)); // b0 g0 r0 b1 g1 r1 b2 g2 1722 v2 = _mm_srli_si128(v2, 6); // r2 b3 g3 r3 0 0 0 0 1723 _mm_storeu_si128((__m128i*)(dst + x), v1); 1724 _mm_storel_epi64((__m128i*)(dst + x + 8), v2); 1725 } 1726 1727 for( ; x < len*3; x += 3 ) 1728 { 1729 float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 1730 ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); 1731 ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); 1732 ushort t2 = saturate_cast<ushort>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); 1733 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 1734 } 1735 return; 1736 } 1737 #endif 1738 1739 
transform_(src, dst, m, len, scn, dcn); 1740 } 1741 1742 1743 static void 1744 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn ) 1745 { 1746 #if CV_SSE2 1747 if( USE_SSE2 ) 1748 { 1749 int x = 0; 1750 if( scn == 3 && dcn == 3 ) 1751 { 1752 __m128 m0, m1, m2, m3; 1753 load3x3Matrix(m, m0, m1, m2, m3); 1754 1755 for( ; x < (len - 1)*3; x += 3 ) 1756 { 1757 __m128 x0 = _mm_loadu_ps(src + x); 1758 __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 1759 _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))), 1760 _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))), 1761 _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3); 1762 _mm_storel_pi((__m64*)(dst + x), y0); 1763 _mm_store_ss(dst + x + 2, _mm_movehl_ps(y0,y0)); 1764 } 1765 1766 for( ; x < len*3; x += 3 ) 1767 { 1768 float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 1769 float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); 1770 float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); 1771 float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); 1772 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 1773 } 1774 return; 1775 } 1776 1777 if( scn == 4 && dcn == 4 ) 1778 { 1779 __m128 m0, m1, m2, m3, m4; 1780 load4x4Matrix(m, m0, m1, m2, m3, m4); 1781 1782 for( ; x < len*4; x += 4 ) 1783 { 1784 __m128 x0 = _mm_loadu_ps(src + x); 1785 __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps( 1786 _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))), 1787 _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))), 1788 _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), 1789 _mm_mul_ps(m3, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(3,3,3,3)))), m4); 1790 _mm_storeu_ps(dst + x, y0); 1791 } 1792 return; 1793 } 1794 } 1795 #endif 1796 1797 transform_(src, dst, m, len, scn, dcn); 1798 } 1799 1800 1801 static void 1802 transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) 1803 { 1804 transform_(src, dst, m, len, scn, dcn); 1805 } 1806 1807 static void 1808 transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) 1809 { 1810 transform_(src, dst, m, len, scn, dcn); 1811 } 1812 1813 static void 1814 transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn) 1815 { 1816 transform_(src, dst, m, len, scn, dcn); 1817 } 1818 1819 static void 1820 transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) 1821 { 1822 transform_(src, dst, m, len, scn, dcn); 1823 } 1824 1825 template<typename T, typename WT> static void 1826 diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int ) 1827 { 1828 int x; 1829 1830 if( cn == 2 ) 1831 { 1832 for( x = 0; x < len*2; x += 2 ) 1833 { 1834 T t0 = saturate_cast<T>(m[0]*src[x] + m[2]); 1835 T t1 = saturate_cast<T>(m[4]*src[x+1] + m[5]); 1836 dst[x] = t0; dst[x+1] = t1; 1837 } 1838 } 1839 else if( cn == 3 ) 1840 { 1841 for( x = 0; x < len*3; x += 3 ) 1842 { 1843 T t0 = saturate_cast<T>(m[0]*src[x] + m[3]); 1844 T t1 = saturate_cast<T>(m[5]*src[x+1] + m[7]); 1845 T t2 = saturate_cast<T>(m[10]*src[x+2] + m[11]); 1846 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 1847 } 1848 } 1849 else if( cn == 4 ) 1850 { 1851 for( x = 0; x < len*4; x += 4 ) 1852 { 1853 T t0 = saturate_cast<T>(m[0]*src[x] + m[4]); 1854 T t1 = saturate_cast<T>(m[6]*src[x+1] + m[9]); 1855 dst[x] = t0; dst[x+1] = t1; 1856 t0 = saturate_cast<T>(m[12]*src[x+2] + m[14]); 1857 t1 = saturate_cast<T>(m[18]*src[x+3] + m[19]); 1858 dst[x+2] = t0; 
dst[x+3] = t1; 1859 } 1860 } 1861 else 1862 { 1863 for( x = 0; x < len; x++, src += cn, dst += cn ) 1864 { 1865 const WT* _m = m; 1866 for( int j = 0; j < cn; j++, _m += cn + 1 ) 1867 dst[j] = saturate_cast<T>(src[j]*_m[j] + _m[cn]); 1868 } 1869 } 1870 } 1871 1872 static void 1873 diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn) 1874 { 1875 diagtransform_(src, dst, m, len, scn, dcn); 1876 } 1877 1878 static void 1879 diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) 1880 { 1881 diagtransform_(src, dst, m, len, scn, dcn); 1882 } 1883 1884 static void 1885 diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn) 1886 { 1887 diagtransform_(src, dst, m, len, scn, dcn); 1888 } 1889 1890 static void 1891 diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) 1892 { 1893 diagtransform_(src, dst, m, len, scn, dcn); 1894 } 1895 1896 static void 1897 diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn) 1898 { 1899 diagtransform_(src, dst, m, len, scn, dcn); 1900 } 1901 1902 static void 1903 diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn) 1904 { 1905 diagtransform_(src, dst, m, len, scn, dcn); 1906 } 1907 1908 static void 1909 diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) 1910 { 1911 diagtransform_(src, dst, m, len, scn, dcn); 1912 } 1913 1914 1915 typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int ); 1916 1917 static TransformFunc getTransformFunc(int depth) 1918 { 1919 static TransformFunc transformTab[] = 1920 { 1921 (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u, 1922 (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f, 1923 (TransformFunc)transform_64f, 0 1924 }; 1925 1926 return transformTab[depth]; 1927 } 1928 1929 static TransformFunc getDiagTransformFunc(int depth) 1930 { 1931 static TransformFunc diagTransformTab[] = 1932 { 1933 (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u, 1934 (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f, 1935 (TransformFunc)diagtransform_64f, 0 1936 }; 1937 1938 return diagTransformTab[depth]; 1939 } 1940 1941 } 1942 1943 void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx ) 1944 { 1945 Mat src = _src.getMat(), m = _mtx.getMat(); 1946 int depth = src.depth(), scn = src.channels(), dcn = m.rows; 1947 CV_Assert( scn == m.cols || scn + 1 == m.cols ); 1948 bool isDiag = false; 1949 1950 _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); 1951 Mat dst = _dst.getMat(); 1952 1953 int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F; 1954 AutoBuffer<double> _mbuf; 1955 double* mbuf; 1956 1957 if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 ) 1958 { 1959 _mbuf.allocate(dcn*(scn+1)); 1960 mbuf = (double*)_mbuf; 1961 Mat tmp(dcn, scn+1, mtype, mbuf); 1962 memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize()); 1963 if( m.cols == scn+1 ) 1964 m.convertTo(tmp, mtype); 1965 else 1966 { 1967 Mat tmppart = tmp.colRange(0, m.cols); 1968 m.convertTo(tmppart, mtype); 1969 } 1970 m = tmp; 1971 } 1972 else 1973 mbuf = m.ptr<double>(); 1974 1975 if( scn == dcn ) 1976 { 1977 int i, j; 1978 double eps = mtype == CV_32F ? 
FLT_EPSILON : DBL_EPSILON; 1979 1980 if( scn == 1 ) 1981 { 1982 double alpha, beta; 1983 if( mtype == CV_32F ) 1984 alpha = m.at<float>(0), beta = m.at<float>(1); 1985 else 1986 alpha = m.at<double>(0), beta = m.at<double>(1); 1987 src.convertTo(dst, dst.type(), alpha, beta); 1988 return; 1989 } 1990 1991 for( i = 0, isDiag = true; isDiag && i < scn; i++ ) 1992 { 1993 for( j = 0; isDiag && j < scn; j++ ) 1994 { 1995 double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j); 1996 if( i != j && fabs(v) > eps ) 1997 isDiag = false; 1998 } 1999 } 2000 } 2001 2002 TransformFunc func = isDiag ? getDiagTransformFunc(depth): getTransformFunc(depth); 2003 CV_Assert( func != 0 ); 2004 2005 const Mat* arrays[] = {&src, &dst, 0}; 2006 uchar* ptrs[2]; 2007 NAryMatIterator it(arrays, ptrs); 2008 size_t i, total = it.size; 2009 2010 for( i = 0; i < it.nplanes; i++, ++it ) 2011 func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); 2012 } 2013 2014 /****************************************************************************************\ 2015 * Perspective Transform * 2016 \****************************************************************************************/ 2017 2018 namespace cv 2019 { 2020 2021 template<typename T> static void 2022 perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn ) 2023 { 2024 const double eps = FLT_EPSILON; 2025 int i; 2026 2027 if( scn == 2 && dcn == 2 ) 2028 { 2029 for( i = 0; i < len*2; i += 2 ) 2030 { 2031 T x = src[i], y = src[i + 1]; 2032 double w = x*m[6] + y*m[7] + m[8]; 2033 2034 if( fabs(w) > eps ) 2035 { 2036 w = 1./w; 2037 dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w); 2038 dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w); 2039 } 2040 else 2041 dst[i] = dst[i+1] = (T)0; 2042 } 2043 } 2044 else if( scn == 3 && dcn == 3 ) 2045 { 2046 for( i = 0; i < len*3; i += 3 ) 2047 { 2048 T x = src[i], y = src[i + 1], z = src[i + 2]; 2049 double w = x*m[12] + y*m[13] + z*m[14] + m[15]; 2050 2051 if( fabs(w) > eps ) 2052 { 2053 w = 1./w; 2054 dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w); 2055 dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w); 2056 dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w); 2057 } 2058 else 2059 dst[i] = dst[i+1] = dst[i+2] = (T)0; 2060 } 2061 } 2062 else if( scn == 3 && dcn == 2 ) 2063 { 2064 for( i = 0; i < len; i++, src += 3, dst += 2 ) 2065 { 2066 T x = src[0], y = src[1], z = src[2]; 2067 double w = x*m[8] + y*m[9] + z*m[10] + m[11]; 2068 2069 if( fabs(w) > eps ) 2070 { 2071 w = 1./w; 2072 dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w); 2073 dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w); 2074 } 2075 else 2076 dst[0] = dst[1] = (T)0; 2077 } 2078 } 2079 else 2080 { 2081 for( i = 0; i < len; i++, src += scn, dst += dcn ) 2082 { 2083 const double* _m = m + dcn*(scn + 1); 2084 double w = _m[scn]; 2085 int j, k; 2086 for( k = 0; k < scn; k++ ) 2087 w += _m[k]*src[k]; 2088 if( fabs(w) > eps ) 2089 { 2090 _m = m; 2091 for( j = 0; j < dcn; j++, _m += scn + 1 ) 2092 { 2093 double s = _m[scn]; 2094 for( k = 0; k < scn; k++ ) 2095 s += _m[k]*src[k]; 2096 dst[j] = (T)(s*w); 2097 } 2098 } 2099 else 2100 for( j = 0; j < dcn; j++ ) 2101 dst[j] = 0; 2102 } 2103 } 2104 } 2105 2106 2107 static void 2108 perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn) 2109 { 2110 perspectiveTransform_(src, dst, m, len, scn, dcn); 2111 } 2112 2113 static void 2114 perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) 2115 { 2116 
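    /* Worked example for the 2-channel case handled by perspectiveTransform_()
       above (illustrative sketch only; the values are made up): a 3x3 matrix m,
       stored row-major, maps a point (x, y) to

           w  = m[6]*x + m[7]*y + m[8]
           x' = (m[0]*x + m[1]*y + m[2]) / w
           y' = (m[3]*x + m[4]*y + m[5]) / w

       and writes zeros when |w| <= FLT_EPSILON. From user code this path is
       reached through cv::perspectiveTransform(), e.g.

           std::vector<cv::Point2f> pts = { {0.f, 0.f}, {1.f, 2.f} }, out;
           cv::Matx33d H(1, 0, 5,
                         0, 1, 7,
                         0, 0, 1);              // pure translation by (5, 7)
           cv::perspectiveTransform(pts, out, H);
    */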
perspectiveTransform_(src, dst, m, len, scn, dcn); 2117 } 2118 2119 } 2120 2121 void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx ) 2122 { 2123 Mat src = _src.getMat(), m = _mtx.getMat(); 2124 int depth = src.depth(), scn = src.channels(), dcn = m.rows-1; 2125 CV_Assert( scn + 1 == m.cols ); 2126 CV_Assert( depth == CV_32F || depth == CV_64F ); 2127 2128 _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); 2129 Mat dst = _dst.getMat(); 2130 2131 const int mtype = CV_64F; 2132 AutoBuffer<double> _mbuf; 2133 double* mbuf = _mbuf; 2134 2135 if( !m.isContinuous() || m.type() != mtype ) 2136 { 2137 _mbuf.allocate((dcn+1)*(scn+1)); 2138 Mat tmp(dcn+1, scn+1, mtype, (double*)_mbuf); 2139 m.convertTo(tmp, mtype); 2140 m = tmp; 2141 } 2142 else 2143 mbuf = m.ptr<double>(); 2144 2145 TransformFunc func = depth == CV_32F ? 2146 (TransformFunc)perspectiveTransform_32f : 2147 (TransformFunc)perspectiveTransform_64f; 2148 CV_Assert( func != 0 ); 2149 2150 const Mat* arrays[] = {&src, &dst, 0}; 2151 uchar* ptrs[2]; 2152 NAryMatIterator it(arrays, ptrs); 2153 size_t i, total = it.size; 2154 2155 for( i = 0; i < it.nplanes; i++, ++it ) 2156 func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); 2157 } 2158 2159 /****************************************************************************************\ 2160 * ScaleAdd * 2161 \****************************************************************************************/ 2162 2163 namespace cv 2164 { 2165 2166 static void scaleAdd_32f(const float* src1, const float* src2, float* dst, 2167 int len, float* _alpha) 2168 { 2169 float alpha = *_alpha; 2170 int i = 0; 2171 #if CV_SSE2 2172 if( USE_SSE2 ) 2173 { 2174 __m128 a4 = _mm_set1_ps(alpha); 2175 if( (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 ) 2176 for( ; i <= len - 8; i += 8 ) 2177 { 2178 __m128 x0, x1, y0, y1, t0, t1; 2179 x0 = _mm_load_ps(src1 + i); x1 = _mm_load_ps(src1 + i + 4); 2180 y0 = _mm_load_ps(src2 + i); y1 = _mm_load_ps(src2 + i + 4); 2181 t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0); 2182 t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1); 2183 _mm_store_ps(dst + i, t0); 2184 _mm_store_ps(dst + i + 4, t1); 2185 } 2186 else 2187 for( ; i <= len - 8; i += 8 ) 2188 { 2189 __m128 x0, x1, y0, y1, t0, t1; 2190 x0 = _mm_loadu_ps(src1 + i); x1 = _mm_loadu_ps(src1 + i + 4); 2191 y0 = _mm_loadu_ps(src2 + i); y1 = _mm_loadu_ps(src2 + i + 4); 2192 t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0); 2193 t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1); 2194 _mm_storeu_ps(dst + i, t0); 2195 _mm_storeu_ps(dst + i + 4, t1); 2196 } 2197 } 2198 else 2199 #elif CV_NEON 2200 if (true) 2201 { 2202 for ( ; i <= len - 4; i += 4) 2203 { 2204 float32x4_t v_src1 = vld1q_f32(src1 + i), v_src2 = vld1q_f32(src2 + i); 2205 vst1q_f32(dst + i, vaddq_f32(vmulq_n_f32(v_src1, alpha), v_src2)); 2206 } 2207 } 2208 else 2209 #endif 2210 //vz why do we need unroll here? 
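    // Scalar fallback for the SIMD branches above: the loop below computes
    //     dst[i] = src1[i]*alpha + src2[i]
    // four elements per iteration, and the loop after it handles the remaining
    // (len % 4) elements one at a time.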
    for( ; i <= len - 4; i += 4 )
    {
        float t0, t1;
        t0 = src1[i]*alpha + src2[i];
        t1 = src1[i+1]*alpha + src2[i+1];
        dst[i] = t0; dst[i+1] = t1;
        t0 = src1[i+2]*alpha + src2[i+2];
        t1 = src1[i+3]*alpha + src2[i+3];
        dst[i+2] = t0; dst[i+3] = t1;
    }
    for(; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
}


static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
                         int len, double* _alpha)
{
    double alpha = *_alpha;
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
    {
        __m128d a2 = _mm_set1_pd(alpha);
        for( ; i <= len - 4; i += 4 )
        {
            __m128d x0, x1, y0, y1, t0, t1;
            x0 = _mm_load_pd(src1 + i); x1 = _mm_load_pd(src1 + i + 2);
            y0 = _mm_load_pd(src2 + i); y1 = _mm_load_pd(src2 + i + 2);
            t0 = _mm_add_pd(_mm_mul_pd(x0, a2), y0);
            t1 = _mm_add_pd(_mm_mul_pd(x1, a2), y1);
            _mm_store_pd(dst + i, t0);
            _mm_store_pd(dst + i + 2, t1);
        }
    }
    else
#endif
    // Scalar fallback; like the SSE2 branch above it processes four elements
    // per iteration, with the final loop picking up the remaining tail.
    for( ; i <= len - 4; i += 4 )
    {
        double t0, t1;
        t0 = src1[i]*alpha + src2[i];
        t1 = src1[i+1]*alpha + src2[i+1];
        dst[i] = t0; dst[i+1] = t1;
        t0 = src1[i+2]*alpha + src2[i+2];
        t1 = src1[i+3]*alpha + src2[i+3];
        dst[i+2] = t0; dst[i+3] = t1;
    }
    for(; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
}

typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);

#ifdef HAVE_OPENCL

static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{
    const ocl::Device & d = ocl::Device::getDefault();

    bool doubleSupport = d.doubleFPConfig() > 0;
    Size size = _src1.size();
    int depth = CV_MAT_DEPTH(type);
    if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
        return false;

    _dst.create(size, type);
    int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
    int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst),
        rowsPerWI = d.isIntel() ? 4 : 1;

    char cvt[2][50];
    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
                         " -D wdepth=%d%s -D rowsPerWI=%d",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
                         ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
                         ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
                         ocl::typeToStr(wdepth), wdepth,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);

    if (wdepth == CV_32F)
        k.args(src1arg, src2arg, dstarg, (float)alpha);
    else
        k.args(src1arg, src2arg, dstarg, alpha);

    size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

}

void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst )
{
    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( type == _src2.type() );

    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
               ocl_scaleAdd(_src1, alpha, _src2, _dst, type))

    if( depth < CV_32F )
    {
        addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
        return;
    }

    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
    CV_Assert(src1.size == src2.size);

    _dst.create(src1.dims, src1.size, type);
    Mat dst = _dst.getMat();

    float falpha = (float)alpha;
    void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;

    ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f;

    if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous())
    {
        size_t len = src1.total()*cn;
        func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha);
        return;
    }

    const Mat* arrays[] = {&src1, &src2, &dst, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
    size_t i, len = it.size*cn;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
}

/****************************************************************************************\
*                                   Covariance Matrix                                    *
\****************************************************************************************/

void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype )
{
    CV_Assert( data && nsamples > 0 );
    Size size = data[0].size();
    int sz = size.width * size.height, esz = (int)data[0].elemSize();
    int type = data[0].type();
    Mat mean;
    ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ?
ctype : type), _mean.depth()), CV_32F); 2368 2369 if( (flags & CV_COVAR_USE_AVG) != 0 ) 2370 { 2371 CV_Assert( _mean.size() == size ); 2372 if( _mean.isContinuous() && _mean.type() == ctype ) 2373 mean = _mean.reshape(1, 1); 2374 else 2375 { 2376 _mean.convertTo(mean, ctype); 2377 mean = mean.reshape(1, 1); 2378 } 2379 } 2380 2381 Mat _data(nsamples, sz, type); 2382 2383 for( int i = 0; i < nsamples; i++ ) 2384 { 2385 CV_Assert( data[i].size() == size && data[i].type() == type ); 2386 if( data[i].isContinuous() ) 2387 memcpy( _data.ptr(i), data[i].ptr(), sz*esz ); 2388 else 2389 { 2390 Mat dataRow(size.height, size.width, type, _data.ptr(i)); 2391 data[i].copyTo(dataRow); 2392 } 2393 } 2394 2395 calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype ); 2396 if( (flags & CV_COVAR_USE_AVG) == 0 ) 2397 _mean = mean.reshape(1, size.height); 2398 } 2399 2400 void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype ) 2401 { 2402 if(_src.kind() == _InputArray::STD_VECTOR_MAT) 2403 { 2404 std::vector<cv::Mat> src; 2405 _src.getMatVector(src); 2406 2407 CV_Assert( src.size() > 0 ); 2408 2409 Size size = src[0].size(); 2410 int type = src[0].type(); 2411 2412 ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F); 2413 2414 Mat _data(static_cast<int>(src.size()), size.area(), type); 2415 2416 int i = 0; 2417 for(std::vector<cv::Mat>::iterator each = src.begin(); each != src.end(); each++, i++ ) 2418 { 2419 CV_Assert( (*each).size() == size && (*each).type() == type ); 2420 Mat dataRow(size.height, size.width, type, _data.ptr(i)); 2421 (*each).copyTo(dataRow); 2422 } 2423 2424 Mat mean; 2425 if( (flags & CV_COVAR_USE_AVG) != 0 ) 2426 { 2427 CV_Assert( _mean.size() == size ); 2428 2429 if( mean.type() != ctype ) 2430 { 2431 mean = _mean.getMat(); 2432 _mean.create(mean.size(), ctype); 2433 Mat tmp = _mean.getMat(); 2434 mean.convertTo(tmp, ctype); 2435 mean = tmp; 2436 } 2437 2438 mean = _mean.getMat().reshape(1, 1); 2439 } 2440 2441 calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype ); 2442 2443 if( (flags & CV_COVAR_USE_AVG) == 0 ) 2444 { 2445 mean = mean.reshape(1, size.height); 2446 mean.copyTo(_mean); 2447 } 2448 return; 2449 } 2450 2451 Mat data = _src.getMat(), mean; 2452 CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) ); 2453 bool takeRows = (flags & CV_COVAR_ROWS) != 0; 2454 int type = data.type(); 2455 int nsamples = takeRows ? data.rows : data.cols; 2456 CV_Assert( nsamples > 0 ); 2457 Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows); 2458 2459 if( (flags & CV_COVAR_USE_AVG) != 0 ) 2460 { 2461 mean = _mean.getMat(); 2462 ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F); 2463 CV_Assert( mean.size() == size ); 2464 if( mean.type() != ctype ) 2465 { 2466 _mean.create(mean.size(), ctype); 2467 Mat tmp = _mean.getMat(); 2468 mean.convertTo(tmp, ctype); 2469 mean = tmp; 2470 } 2471 } 2472 else 2473 { 2474 ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F); 2475 reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype ); 2476 mean = _mean.getMat(); 2477 } 2478 2479 mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows, 2480 mean, (flags & CV_COVAR_SCALE) != 0 ? 
1./nsamples : 1, ctype ); 2481 } 2482 2483 /****************************************************************************************\ 2484 * Mahalanobis * 2485 \****************************************************************************************/ 2486 2487 double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar ) 2488 { 2489 Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat(); 2490 int type = v1.type(), depth = v1.depth(); 2491 Size sz = v1.size(); 2492 int i, j, len = sz.width*sz.height*v1.channels(); 2493 AutoBuffer<double> buf(len); 2494 double result = 0; 2495 2496 CV_Assert( type == v2.type() && type == icovar.type() && 2497 sz == v2.size() && len == icovar.rows && len == icovar.cols ); 2498 2499 sz.width *= v1.channels(); 2500 if( v1.isContinuous() && v2.isContinuous() ) 2501 { 2502 sz.width *= sz.height; 2503 sz.height = 1; 2504 } 2505 2506 if( depth == CV_32F ) 2507 { 2508 const float* src1 = v1.ptr<float>(); 2509 const float* src2 = v2.ptr<float>(); 2510 size_t step1 = v1.step/sizeof(src1[0]); 2511 size_t step2 = v2.step/sizeof(src2[0]); 2512 double* diff = buf; 2513 const float* mat = icovar.ptr<float>(); 2514 size_t matstep = icovar.step/sizeof(mat[0]); 2515 2516 for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) 2517 { 2518 for( i = 0; i < sz.width; i++ ) 2519 diff[i] = src1[i] - src2[i]; 2520 } 2521 2522 diff = buf; 2523 for( i = 0; i < len; i++, mat += matstep ) 2524 { 2525 double row_sum = 0; 2526 j = 0; 2527 #if CV_ENABLE_UNROLLED 2528 for(; j <= len - 4; j += 4 ) 2529 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + 2530 diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; 2531 #endif 2532 for( ; j < len; j++ ) 2533 row_sum += diff[j]*mat[j]; 2534 result += row_sum * diff[i]; 2535 } 2536 } 2537 else if( depth == CV_64F ) 2538 { 2539 const double* src1 = v1.ptr<double>(); 2540 const double* src2 = v2.ptr<double>(); 2541 size_t step1 = v1.step/sizeof(src1[0]); 2542 size_t step2 = v2.step/sizeof(src2[0]); 2543 double* diff = buf; 2544 const double* mat = icovar.ptr<double>(); 2545 size_t matstep = icovar.step/sizeof(mat[0]); 2546 2547 for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) 2548 { 2549 for( i = 0; i < sz.width; i++ ) 2550 diff[i] = src1[i] - src2[i]; 2551 } 2552 2553 diff = buf; 2554 for( i = 0; i < len; i++, mat += matstep ) 2555 { 2556 double row_sum = 0; 2557 j = 0; 2558 #if CV_ENABLE_UNROLLED 2559 for(; j <= len - 4; j += 4 ) 2560 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + 2561 diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; 2562 #endif 2563 for( ; j < len; j++ ) 2564 row_sum += diff[j]*mat[j]; 2565 result += row_sum * diff[i]; 2566 } 2567 } 2568 else 2569 CV_Error( CV_StsUnsupportedFormat, "" ); 2570 2571 return std::sqrt(result); 2572 } 2573 2574 /****************************************************************************************\ 2575 * MulTransposed * 2576 \****************************************************************************************/ 2577 2578 namespace cv 2579 { 2580 2581 template<typename sT, typename dT> static void 2582 MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) 2583 { 2584 int i, j, k; 2585 const sT* src = srcmat.ptr<sT>(); 2586 dT* dst = dstmat.ptr<dT>(); 2587 const dT* delta = deltamat.ptr<dT>(); 2588 size_t srcstep = srcmat.step/sizeof(src[0]); 2589 size_t dststep = dstmat.step/sizeof(dst[0]); 2590 size_t deltastep = deltamat.rows > 1 ? 
deltamat.step/sizeof(delta[0]) : 0; 2591 int delta_cols = deltamat.cols; 2592 Size size = srcmat.size(); 2593 dT* tdst = dst; 2594 dT* col_buf = 0; 2595 dT* delta_buf = 0; 2596 int buf_size = size.height*sizeof(dT); 2597 AutoBuffer<uchar> buf; 2598 2599 if( delta && delta_cols < size.width ) 2600 { 2601 assert( delta_cols == 1 ); 2602 buf_size *= 5; 2603 } 2604 buf.allocate(buf_size); 2605 col_buf = (dT*)(uchar*)buf; 2606 2607 if( delta && delta_cols < size.width ) 2608 { 2609 delta_buf = col_buf + size.height; 2610 for( i = 0; i < size.height; i++ ) 2611 delta_buf[i*4] = delta_buf[i*4+1] = 2612 delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep]; 2613 delta = delta_buf; 2614 deltastep = deltastep ? 4 : 0; 2615 } 2616 2617 if( !delta ) 2618 for( i = 0; i < size.width; i++, tdst += dststep ) 2619 { 2620 for( k = 0; k < size.height; k++ ) 2621 col_buf[k] = src[k*srcstep+i]; 2622 2623 for( j = i; j <= size.width - 4; j += 4 ) 2624 { 2625 double s0 = 0, s1 = 0, s2 = 0, s3 = 0; 2626 const sT *tsrc = src + j; 2627 2628 for( k = 0; k < size.height; k++, tsrc += srcstep ) 2629 { 2630 double a = col_buf[k]; 2631 s0 += a * tsrc[0]; 2632 s1 += a * tsrc[1]; 2633 s2 += a * tsrc[2]; 2634 s3 += a * tsrc[3]; 2635 } 2636 2637 tdst[j] = (dT)(s0*scale); 2638 tdst[j+1] = (dT)(s1*scale); 2639 tdst[j+2] = (dT)(s2*scale); 2640 tdst[j+3] = (dT)(s3*scale); 2641 } 2642 2643 for( ; j < size.width; j++ ) 2644 { 2645 double s0 = 0; 2646 const sT *tsrc = src + j; 2647 2648 for( k = 0; k < size.height; k++, tsrc += srcstep ) 2649 s0 += (double)col_buf[k] * tsrc[0]; 2650 2651 tdst[j] = (dT)(s0*scale); 2652 } 2653 } 2654 else 2655 for( i = 0; i < size.width; i++, tdst += dststep ) 2656 { 2657 if( !delta_buf ) 2658 for( k = 0; k < size.height; k++ ) 2659 col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i]; 2660 else 2661 for( k = 0; k < size.height; k++ ) 2662 col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep]; 2663 2664 for( j = i; j <= size.width - 4; j += 4 ) 2665 { 2666 double s0 = 0, s1 = 0, s2 = 0, s3 = 0; 2667 const sT *tsrc = src + j; 2668 const dT *d = delta_buf ? delta_buf : delta + j; 2669 2670 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) 2671 { 2672 double a = col_buf[k]; 2673 s0 += a * (tsrc[0] - d[0]); 2674 s1 += a * (tsrc[1] - d[1]); 2675 s2 += a * (tsrc[2] - d[2]); 2676 s3 += a * (tsrc[3] - d[3]); 2677 } 2678 2679 tdst[j] = (dT)(s0*scale); 2680 tdst[j+1] = (dT)(s1*scale); 2681 tdst[j+2] = (dT)(s2*scale); 2682 tdst[j+3] = (dT)(s3*scale); 2683 } 2684 2685 for( ; j < size.width; j++ ) 2686 { 2687 double s0 = 0; 2688 const sT *tsrc = src + j; 2689 const dT *d = delta_buf ? delta_buf : delta + j; 2690 2691 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) 2692 s0 += (double)col_buf[k] * (tsrc[0] - d[0]); 2693 2694 tdst[j] = (dT)(s0*scale); 2695 } 2696 } 2697 } 2698 2699 2700 template<typename sT, typename dT> static void 2701 MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) 2702 { 2703 int i, j, k; 2704 const sT* src = srcmat.ptr<sT>(); 2705 dT* dst = dstmat.ptr<dT>(); 2706 const dT* delta = deltamat.ptr<dT>(); 2707 size_t srcstep = srcmat.step/sizeof(src[0]); 2708 size_t dststep = dstmat.step/sizeof(dst[0]); 2709 size_t deltastep = deltamat.rows > 1 ? 
deltamat.step/sizeof(delta[0]) : 0; 2710 int delta_cols = deltamat.cols; 2711 Size size = srcmat.size(); 2712 dT* tdst = dst; 2713 2714 if( !delta ) 2715 for( i = 0; i < size.height; i++, tdst += dststep ) 2716 for( j = i; j < size.height; j++ ) 2717 { 2718 double s = 0; 2719 const sT *tsrc1 = src + i*srcstep; 2720 const sT *tsrc2 = src + j*srcstep; 2721 2722 for( k = 0; k <= size.width - 4; k += 4 ) 2723 s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] + 2724 (double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3]; 2725 for( ; k < size.width; k++ ) 2726 s += (double)tsrc1[k] * tsrc2[k]; 2727 tdst[j] = (dT)(s*scale); 2728 } 2729 else 2730 { 2731 dT delta_buf[4]; 2732 int delta_shift = delta_cols == size.width ? 4 : 0; 2733 AutoBuffer<uchar> buf(size.width*sizeof(dT)); 2734 dT* row_buf = (dT*)(uchar*)buf; 2735 2736 for( i = 0; i < size.height; i++, tdst += dststep ) 2737 { 2738 const sT *tsrc1 = src + i*srcstep; 2739 const dT *tdelta1 = delta + i*deltastep; 2740 2741 if( delta_cols < size.width ) 2742 for( k = 0; k < size.width; k++ ) 2743 row_buf[k] = tsrc1[k] - tdelta1[0]; 2744 else 2745 for( k = 0; k < size.width; k++ ) 2746 row_buf[k] = tsrc1[k] - tdelta1[k]; 2747 2748 for( j = i; j < size.height; j++ ) 2749 { 2750 double s = 0; 2751 const sT *tsrc2 = src + j*srcstep; 2752 const dT *tdelta2 = delta + j*deltastep; 2753 if( delta_cols < size.width ) 2754 { 2755 delta_buf[0] = delta_buf[1] = 2756 delta_buf[2] = delta_buf[3] = tdelta2[0]; 2757 tdelta2 = delta_buf; 2758 } 2759 for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift ) 2760 s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) + 2761 (double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) + 2762 (double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) + 2763 (double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]); 2764 for( ; k < size.width; k++, tdelta2++ ) 2765 s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]); 2766 tdst[j] = (dT)(s*scale); 2767 } 2768 } 2769 } 2770 } 2771 2772 typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale); 2773 2774 } 2775 2776 void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata, 2777 InputArray _delta, double scale, int dtype ) 2778 { 2779 Mat src = _src.getMat(), delta = _delta.getMat(); 2780 const int gemm_level = 100; // boundary above which GEMM is faster. 2781 int stype = src.type(); 2782 dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F); 2783 CV_Assert( src.channels() == 1 ); 2784 2785 if( !delta.empty() ) 2786 { 2787 CV_Assert( delta.channels() == 1 && 2788 (delta.rows == src.rows || delta.rows == 1) && 2789 (delta.cols == src.cols || delta.cols == 1)); 2790 if( delta.type() != dtype ) 2791 delta.convertTo(delta, dtype); 2792 } 2793 2794 int dsize = ata ? src.cols : src.rows; 2795 _dst.create( dsize, dsize, dtype ); 2796 Mat dst = _dst.getMat(); 2797 2798 if( src.data == dst.data || (stype == dtype && 2799 (dst.cols >= gemm_level && dst.rows >= gemm_level && 2800 src.cols >= gemm_level && src.rows >= gemm_level))) 2801 { 2802 Mat src2; 2803 const Mat* tsrc = &src; 2804 if( !delta.empty() ) 2805 { 2806 if( delta.size() == src.size() ) 2807 subtract( src, delta, src2 ); 2808 else 2809 { 2810 repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2); 2811 subtract( src, src2, src2 ); 2812 } 2813 tsrc = &src2; 2814 } 2815 gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? 
GEMM_1_T : GEMM_2_T ); 2816 } 2817 else 2818 { 2819 MulTransposedFunc func = 0; 2820 if(stype == CV_8U && dtype == CV_32F) 2821 { 2822 if(ata) 2823 func = MulTransposedR<uchar,float>; 2824 else 2825 func = MulTransposedL<uchar,float>; 2826 } 2827 else if(stype == CV_8U && dtype == CV_64F) 2828 { 2829 if(ata) 2830 func = MulTransposedR<uchar,double>; 2831 else 2832 func = MulTransposedL<uchar,double>; 2833 } 2834 else if(stype == CV_16U && dtype == CV_32F) 2835 { 2836 if(ata) 2837 func = MulTransposedR<ushort,float>; 2838 else 2839 func = MulTransposedL<ushort,float>; 2840 } 2841 else if(stype == CV_16U && dtype == CV_64F) 2842 { 2843 if(ata) 2844 func = MulTransposedR<ushort,double>; 2845 else 2846 func = MulTransposedL<ushort,double>; 2847 } 2848 else if(stype == CV_16S && dtype == CV_32F) 2849 { 2850 if(ata) 2851 func = MulTransposedR<short,float>; 2852 else 2853 func = MulTransposedL<short,float>; 2854 } 2855 else if(stype == CV_16S && dtype == CV_64F) 2856 { 2857 if(ata) 2858 func = MulTransposedR<short,double>; 2859 else 2860 func = MulTransposedL<short,double>; 2861 } 2862 else if(stype == CV_32F && dtype == CV_32F) 2863 { 2864 if(ata) 2865 func = MulTransposedR<float,float>; 2866 else 2867 func = MulTransposedL<float,float>; 2868 } 2869 else if(stype == CV_32F && dtype == CV_64F) 2870 { 2871 if(ata) 2872 func = MulTransposedR<float,double>; 2873 else 2874 func = MulTransposedL<float,double>; 2875 } 2876 else if(stype == CV_64F && dtype == CV_64F) 2877 { 2878 if(ata) 2879 func = MulTransposedR<double,double>; 2880 else 2881 func = MulTransposedL<double,double>; 2882 } 2883 if( !func ) 2884 CV_Error( CV_StsUnsupportedFormat, "" ); 2885 2886 func( src, dst, delta, scale ); 2887 completeSymm( dst, false ); 2888 } 2889 } 2890 2891 /****************************************************************************************\ 2892 * Dot Product * 2893 \****************************************************************************************/ 2894 2895 namespace cv 2896 { 2897 2898 template<typename T> double 2899 dotProd_(const T* src1, const T* src2, int len) 2900 { 2901 int i = 0; 2902 double result = 0; 2903 2904 #if CV_ENABLE_UNROLLED 2905 for( ; i <= len - 4; i += 4 ) 2906 result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] + 2907 (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3]; 2908 #endif 2909 for( ; i < len; i++ ) 2910 result += (double)src1[i]*src2[i]; 2911 2912 return result; 2913 } 2914 2915 2916 static double dotProd_8u(const uchar* src1, const uchar* src2, int len) 2917 { 2918 double r = 0; 2919 #if ARITHM_USE_IPP && 0 2920 CV_IPP_CHECK() 2921 { 2922 if (0 <= ippiDotProd_8u64f_C1R(src1, (int)(len*sizeof(src1[0])), 2923 src2, (int)(len*sizeof(src2[0])), 2924 ippiSize(len, 1), &r)) 2925 { 2926 CV_IMPL_ADD(CV_IMPL_IPP); 2927 return r; 2928 } 2929 setIppErrorStatus(); 2930 } 2931 #endif 2932 int i = 0; 2933 2934 #if CV_SSE2 2935 if( USE_SSE2 ) 2936 { 2937 int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; 2938 __m128i z = _mm_setzero_si128(); 2939 CV_DECL_ALIGNED(16) int buf[4]; 2940 2941 while( i < len0 ) 2942 { 2943 blockSize = std::min(len0 - i, blockSize0); 2944 __m128i s = z; 2945 j = 0; 2946 for( ; j <= blockSize - 16; j += 16 ) 2947 { 2948 __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j)); 2949 __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j)); 2950 __m128i s0, s1, s2, s3; 2951 s0 = _mm_unpacklo_epi8(b0, z); 2952 s2 = _mm_unpackhi_epi8(b0, z); 2953 s1 = _mm_unpacklo_epi8(b1, z); 2954 s3 = _mm_unpackhi_epi8(b1, z); 2955 s0 = 
_mm_madd_epi16(s0, s1); 2956 s2 = _mm_madd_epi16(s2, s3); 2957 s = _mm_add_epi32(s, s0); 2958 s = _mm_add_epi32(s, s2); 2959 } 2960 2961 for( ; j < blockSize; j += 4 ) 2962 { 2963 __m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z); 2964 __m128i s1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src2 + j)), z); 2965 s0 = _mm_madd_epi16(s0, s1); 2966 s = _mm_add_epi32(s, s0); 2967 } 2968 2969 _mm_store_si128((__m128i*)buf, s); 2970 r += buf[0] + buf[1] + buf[2] + buf[3]; 2971 2972 src1 += blockSize; 2973 src2 += blockSize; 2974 i += blockSize; 2975 } 2976 } 2977 #elif CV_NEON 2978 int len0 = len & -8, blockSize0 = (1 << 15), blockSize; 2979 uint32x4_t v_zero = vdupq_n_u32(0u); 2980 CV_DECL_ALIGNED(16) uint buf[4]; 2981 2982 while( i < len0 ) 2983 { 2984 blockSize = std::min(len0 - i, blockSize0); 2985 uint32x4_t v_sum = v_zero; 2986 2987 int j = 0; 2988 for( ; j <= blockSize - 16; j += 16 ) 2989 { 2990 uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j); 2991 2992 uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2)); 2993 v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); 2994 v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); 2995 2996 v_src10 = vmovl_u8(vget_high_u8(v_src1)); 2997 v_src20 = vmovl_u8(vget_high_u8(v_src2)); 2998 v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); 2999 v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); 3000 } 3001 3002 for( ; j <= blockSize - 8; j += 8 ) 3003 { 3004 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j)); 3005 v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2)); 3006 v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2)); 3007 } 3008 3009 vst1q_u32(buf, v_sum); 3010 r += buf[0] + buf[1] + buf[2] + buf[3]; 3011 3012 src1 += blockSize; 3013 src2 += blockSize; 3014 i += blockSize; 3015 } 3016 #endif 3017 return r + dotProd_(src1, src2, len - i); 3018 } 3019 3020 3021 static double dotProd_8s(const schar* src1, const schar* src2, int len) 3022 { 3023 int i = 0; 3024 double r = 0.0; 3025 3026 #if CV_SSE2 3027 if( USE_SSE2 ) 3028 { 3029 int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; 3030 __m128i z = _mm_setzero_si128(); 3031 CV_DECL_ALIGNED(16) int buf[4]; 3032 3033 while( i < len0 ) 3034 { 3035 blockSize = std::min(len0 - i, blockSize0); 3036 __m128i s = z; 3037 j = 0; 3038 for( ; j <= blockSize - 16; j += 16 ) 3039 { 3040 __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j)); 3041 __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j)); 3042 __m128i s0, s1, s2, s3; 3043 s0 = _mm_srai_epi16(_mm_unpacklo_epi8(b0, b0), 8); 3044 s2 = _mm_srai_epi16(_mm_unpackhi_epi8(b0, b0), 8); 3045 s1 = _mm_srai_epi16(_mm_unpacklo_epi8(b1, b1), 8); 3046 s3 = _mm_srai_epi16(_mm_unpackhi_epi8(b1, b1), 8); 3047 s0 = _mm_madd_epi16(s0, s1); 3048 s2 = _mm_madd_epi16(s2, s3); 3049 s = _mm_add_epi32(s, s0); 3050 s = _mm_add_epi32(s, s2); 3051 } 3052 3053 for( ; j < blockSize; j += 4 ) 3054 { 3055 __m128i s0 = _mm_cvtsi32_si128(*(const int*)(src1 + j)); 3056 __m128i s1 = _mm_cvtsi32_si128(*(const int*)(src2 + j)); 3057 s0 = _mm_srai_epi16(_mm_unpacklo_epi8(s0, s0), 8); 3058 s1 = _mm_srai_epi16(_mm_unpacklo_epi8(s1, s1), 8); 3059 s0 = _mm_madd_epi16(s0, s1); 3060 s = _mm_add_epi32(s, s0); 3061 } 3062 3063 _mm_store_si128((__m128i*)buf, s); 3064 r += buf[0] + buf[1] + buf[2] + buf[3]; 3065 3066 src1 += blockSize; 3067 src2 += 
blockSize; 3068 i += blockSize; 3069 } 3070 } 3071 #elif CV_NEON 3072 int len0 = len & -8, blockSize0 = (1 << 14), blockSize; 3073 int32x4_t v_zero = vdupq_n_s32(0); 3074 CV_DECL_ALIGNED(16) int buf[4]; 3075 3076 while( i < len0 ) 3077 { 3078 blockSize = std::min(len0 - i, blockSize0); 3079 int32x4_t v_sum = v_zero; 3080 3081 int j = 0; 3082 for( ; j <= blockSize - 16; j += 16 ) 3083 { 3084 int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j); 3085 3086 int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2)); 3087 v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); 3088 v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); 3089 3090 v_src10 = vmovl_s8(vget_high_s8(v_src1)); 3091 v_src20 = vmovl_s8(vget_high_s8(v_src2)); 3092 v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); 3093 v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); 3094 } 3095 3096 for( ; j <= blockSize - 8; j += 8 ) 3097 { 3098 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j)); 3099 v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2)); 3100 v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2)); 3101 } 3102 3103 vst1q_s32(buf, v_sum); 3104 r += buf[0] + buf[1] + buf[2] + buf[3]; 3105 3106 src1 += blockSize; 3107 src2 += blockSize; 3108 i += blockSize; 3109 } 3110 #endif 3111 3112 return r + dotProd_(src1, src2, len - i); 3113 } 3114 3115 static double dotProd_16u(const ushort* src1, const ushort* src2, int len) 3116 { 3117 #if (ARITHM_USE_IPP == 1) 3118 CV_IPP_CHECK() 3119 { 3120 double r = 0; 3121 if (0 <= ippiDotProd_16u64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r)) 3122 { 3123 CV_IMPL_ADD(CV_IMPL_IPP); 3124 return r; 3125 } 3126 setIppErrorStatus(); 3127 } 3128 #endif 3129 return dotProd_(src1, src2, len); 3130 } 3131 3132 static double dotProd_16s(const short* src1, const short* src2, int len) 3133 { 3134 #if (ARITHM_USE_IPP == 1) 3135 CV_IPP_CHECK() 3136 { 3137 double r = 0; 3138 if (0 <= ippiDotProd_16s64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r)) 3139 { 3140 CV_IMPL_ADD(CV_IMPL_IPP); 3141 return r; 3142 } 3143 setIppErrorStatus(); 3144 } 3145 #endif 3146 return dotProd_(src1, src2, len); 3147 } 3148 3149 static double dotProd_32s(const int* src1, const int* src2, int len) 3150 { 3151 #if (ARITHM_USE_IPP == 1) 3152 CV_IPP_CHECK() 3153 { 3154 double r = 0; 3155 if (0 <= ippiDotProd_32s64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r)) 3156 { 3157 CV_IMPL_ADD(CV_IMPL_IPP); 3158 return r; 3159 } 3160 setIppErrorStatus(); 3161 } 3162 #endif 3163 return dotProd_(src1, src2, len); 3164 } 3165 3166 static double dotProd_32f(const float* src1, const float* src2, int len) 3167 { 3168 double r = 0.0; 3169 int i = 0; 3170 3171 #if (ARITHM_USE_IPP == 1) 3172 CV_IPP_CHECK() 3173 { 3174 if (0 <= ippsDotProd_32f64f(src1, src2, len, &r)) 3175 { 3176 CV_IMPL_ADD(CV_IMPL_IPP); 3177 return r; 3178 } 3179 setIppErrorStatus(); 3180 } 3181 #elif CV_NEON 3182 int len0 = len & -4, blockSize0 = (1 << 13), blockSize; 3183 float32x4_t v_zero = vdupq_n_f32(0.0f); 3184 CV_DECL_ALIGNED(16) float buf[4]; 3185 3186 while( i < len0 ) 3187 { 3188 blockSize = std::min(len0 - i, blockSize0); 3189 float32x4_t v_sum = v_zero; 3190 3191 int j = 0; 3192 for( ; j <= blockSize - 4; j += 4 ) 3193 v_sum = 
vmlaq_f32(v_sum, vld1q_f32(src1 + j), vld1q_f32(src2 + j)); 3194 3195 vst1q_f32(buf, v_sum); 3196 r += buf[0] + buf[1] + buf[2] + buf[3]; 3197 3198 src1 += blockSize; 3199 src2 += blockSize; 3200 i += blockSize; 3201 } 3202 #endif 3203 return r + dotProd_(src1, src2, len - i); 3204 } 3205 3206 static double dotProd_64f(const double* src1, const double* src2, int len) 3207 { 3208 #if (ARITHM_USE_IPP == 1) 3209 CV_IPP_CHECK() 3210 { 3211 double r = 0; 3212 if (0 <= ippsDotProd_64f(src1, src2, len, &r)) 3213 { 3214 CV_IMPL_ADD(CV_IMPL_IPP); 3215 return r; 3216 } 3217 setIppErrorStatus(); 3218 } 3219 #endif 3220 return dotProd_(src1, src2, len); 3221 } 3222 3223 3224 typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len); 3225 3226 static DotProdFunc getDotProdFunc(int depth) 3227 { 3228 static DotProdFunc dotProdTab[] = 3229 { 3230 (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s), 3231 (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s, 3232 (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f), 3233 (DotProdFunc)dotProd_64f, 0 3234 }; 3235 3236 return dotProdTab[depth]; 3237 } 3238 3239 double Mat::dot(InputArray _mat) const 3240 { 3241 Mat mat = _mat.getMat(); 3242 int cn = channels(); 3243 DotProdFunc func = getDotProdFunc(depth()); 3244 CV_Assert( mat.type() == type() && mat.size == size && func != 0 ); 3245 3246 if( isContinuous() && mat.isContinuous() ) 3247 { 3248 size_t len = total()*cn; 3249 if( len == (size_t)(int)len ) 3250 return func(data, mat.data, (int)len); 3251 } 3252 3253 const Mat* arrays[] = {this, &mat, 0}; 3254 uchar* ptrs[2]; 3255 NAryMatIterator it(arrays, ptrs); 3256 int len = (int)(it.size*cn); 3257 double r = 0; 3258 3259 for( size_t i = 0; i < it.nplanes; i++, ++it ) 3260 r += func( ptrs[0], ptrs[1], len ); 3261 3262 return r; 3263 } 3264 3265 } 3266 3267 /****************************************************************************************\ 3268 * Earlier API * 3269 \****************************************************************************************/ 3270 3271 CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha, 3272 const CvArr* Carr, double beta, CvArr* Darr, int flags ) 3273 { 3274 cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr); 3275 cv::Mat C, D = cv::cvarrToMat(Darr); 3276 3277 if( Carr ) 3278 C = cv::cvarrToMat(Carr); 3279 3280 CV_Assert( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)) && 3281 (D.cols == ((flags & CV_GEMM_B_T) == 0 ? 
B.cols : B.rows)) && 3282 D.type() == A.type() ); 3283 3284 gemm( A, B, alpha, C, beta, D, flags ); 3285 } 3286 3287 3288 CV_IMPL void 3289 cvTransform( const CvArr* srcarr, CvArr* dstarr, 3290 const CvMat* transmat, const CvMat* shiftvec ) 3291 { 3292 cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 3293 3294 if( shiftvec ) 3295 { 3296 cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows), 3297 _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols); 3298 m.convertTo(m1, m1.type()); 3299 v.convertTo(v1, v1.type()); 3300 m = _m; 3301 } 3302 3303 CV_Assert( dst.depth() == src.depth() && dst.channels() == m.rows ); 3304 cv::transform( src, dst, m ); 3305 } 3306 3307 3308 CV_IMPL void 3309 cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat ) 3310 { 3311 cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 3312 3313 CV_Assert( dst.type() == src.type() && dst.channels() == m.rows-1 ); 3314 cv::perspectiveTransform( src, dst, m ); 3315 } 3316 3317 3318 CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale, 3319 const CvArr* srcarr2, CvArr* dstarr ) 3320 { 3321 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr); 3322 3323 CV_Assert( src1.size == dst.size && src1.type() == dst.type() ); 3324 cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst ); 3325 } 3326 3327 3328 CV_IMPL void 3329 cvCalcCovarMatrix( const CvArr** vecarr, int count, 3330 CvArr* covarr, CvArr* avgarr, int flags ) 3331 { 3332 cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean; 3333 CV_Assert( vecarr != 0 && count >= 1 ); 3334 3335 if( avgarr ) 3336 mean = mean0 = cv::cvarrToMat(avgarr); 3337 3338 if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 ) 3339 { 3340 3341 cv::Mat data = cv::cvarrToMat(vecarr[0]); 3342 cv::calcCovarMatrix( data, cov, mean, flags, cov.type() ); 3343 } 3344 else 3345 { 3346 std::vector<cv::Mat> data(count); 3347 for( int i = 0; i < count; i++ ) 3348 data[i] = cv::cvarrToMat(vecarr[i]); 3349 cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() ); 3350 } 3351 3352 if( mean.data != mean0.data && mean0.data ) 3353 mean.convertTo(mean0, mean0.type()); 3354 3355 if( cov.data != cov0.data ) 3356 cov.convertTo(cov0, cov0.type()); 3357 } 3358 3359 3360 CV_IMPL double 3361 cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr ) 3362 { 3363 return cv::Mahalanobis(cv::cvarrToMat(srcAarr), 3364 cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr)); 3365 } 3366 3367 CV_IMPL void 3368 cvMulTransposed( const CvArr* srcarr, CvArr* dstarr, 3369 int order, const CvArr* deltaarr, double scale ) 3370 { 3371 cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta; 3372 if( deltaarr ) 3373 delta = cv::cvarrToMat(deltaarr); 3374 cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type()); 3375 if( dst.data != dst0.data ) 3376 dst.convertTo(dst0, dst0.type()); 3377 } 3378 3379 CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr ) 3380 { 3381 return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr)); 3382 } 3383 3384 3385 CV_IMPL void 3386 cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags ) 3387 { 3388 cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr); 3389 cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects); 3390 cv::Mat mean = mean0, evals 
= evals0, evects = evects0; 3391 3392 cv::PCA pca; 3393 pca.mean = mean; 3394 pca.eigenvalues = evals; 3395 pca.eigenvectors = evects; 3396 3397 pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(), 3398 flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0); 3399 3400 if( pca.mean.size() == mean.size() ) 3401 pca.mean.convertTo( mean, mean.type() ); 3402 else 3403 { 3404 cv::Mat temp; pca.mean.convertTo( temp, mean.type() ); 3405 transpose( temp, mean ); 3406 } 3407 3408 evals = pca.eigenvalues; 3409 evects = pca.eigenvectors; 3410 int ecount0 = evals0.cols + evals0.rows - 1; 3411 int ecount = evals.cols + evals.rows - 1; 3412 3413 CV_Assert( (evals0.cols == 1 || evals0.rows == 1) && 3414 ecount0 <= ecount && 3415 evects0.cols == evects.cols && 3416 evects0.rows == ecount0 ); 3417 3418 cv::Mat temp = evals0; 3419 if( evals.rows == 1 ) 3420 evals.colRange(0, ecount0).convertTo(temp, evals0.type()); 3421 else 3422 evals.rowRange(0, ecount0).convertTo(temp, evals0.type()); 3423 if( temp.data != evals0.data ) 3424 transpose(temp, evals0); 3425 evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() ); 3426 3427 // otherwise some datatype's or size's were incorrect, so the output arrays have been reallocated 3428 CV_Assert( mean0.data == mean.data ); 3429 } 3430 3431 3432 CV_IMPL void 3433 cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr, 3434 const CvArr* eigenvects, CvArr* result_arr ) 3435 { 3436 cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr); 3437 cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0; 3438 3439 cv::PCA pca; 3440 pca.mean = mean; 3441 int n; 3442 if( mean.rows == 1 ) 3443 { 3444 CV_Assert(dst.cols <= evects.rows && dst.rows == data.rows); 3445 n = dst.cols; 3446 } 3447 else 3448 { 3449 CV_Assert(dst.rows <= evects.rows && dst.cols == data.cols); 3450 n = dst.rows; 3451 } 3452 pca.eigenvectors = evects.rowRange(0, n); 3453 3454 cv::Mat result = pca.project(data); 3455 if( result.cols != dst.cols ) 3456 result = result.reshape(1, 1); 3457 result.convertTo(dst, dst.type()); 3458 3459 CV_Assert(dst0.data == dst.data); 3460 } 3461 3462 3463 CV_IMPL void 3464 cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr, 3465 const CvArr* eigenvects, CvArr* result_arr ) 3466 { 3467 cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr); 3468 cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0; 3469 3470 cv::PCA pca; 3471 pca.mean = mean; 3472 int n; 3473 if( mean.rows == 1 ) 3474 { 3475 CV_Assert(data.cols <= evects.rows && dst.rows == data.rows); 3476 n = data.cols; 3477 } 3478 else 3479 { 3480 CV_Assert(data.rows <= evects.rows && dst.cols == data.cols); 3481 n = data.rows; 3482 } 3483 pca.eigenvectors = evects.rowRange(0, n); 3484 3485 cv::Mat result = pca.backProject(data); 3486 result.convertTo(dst, dst.type()); 3487 3488 CV_Assert(dst0.data == dst.data); 3489 } 3490 3491 /* End of file. */ 3492
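/* Appendix (editor's illustrative sketch, not part of the library build): a
   minimal example of how the entry points implemented in this file are
   typically reached from user code. The matrices and values below are made up
   for illustration only.

   #include <opencv2/core.hpp>
   #include <cstdio>

   int main()
   {
       // Per-pixel channel transform dst = M*[src; 1] (3 channels in, 3 out).
       cv::Mat src(4, 4, CV_32FC3, cv::Scalar(1, 2, 3)), dst;
       cv::Mat M = (cv::Mat_<float>(3, 4) << 2, 0, 0, 0,
                                             0, 2, 0, 0,
                                             0, 0, 2, 1);
       cv::transform(src, dst, M);   // off-diagonal gains are zero, so this
                                     // should take the diagonal fast path

       // c = a*alpha + b, element by element.
       cv::Mat a = cv::Mat::ones(3, 3, CV_32F), b = cv::Mat::eye(3, 3, CV_32F), c;
       cv::scaleAdd(a, 0.5, b, c);

       // A^T * A scaled by 1/nrows, the same building block calcCovarMatrix() uses.
       cv::Mat cov;
       cv::mulTransposed(a, cov, true, cv::noArray(), 1.0/a.rows);

       std::printf("a.dot(b) = %g\n", a.dot(b));
       return 0;
   }
*/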