Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                        Intel License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
     14 // Third party copyrights are property of their respective owners.
     15 //
     16 // Redistribution and use in source and binary forms, with or without modification,
     17 // are permitted provided that the following conditions are met:
     18 //
     19 //   * Redistribution's of source code must retain the above copyright notice,
     20 //     this list of conditions and the following disclaimer.
     21 //
     22 //   * Redistribution's in binary form must reproduce the above copyright notice,
     23 //     this list of conditions and the following disclaimer in the documentation
     24 //     and/or other materials provided with the distribution.
     25 //
     26 //   * The name of Intel Corporation may not be used to endorse or promote products
     27 //     derived from this software without specific prior written permission.
     28 //
     29 // This software is provided by the copyright holders and contributors "as is" and
     30 // any express or implied warranties, including, but not limited to, the implied
     31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     32 // In no event shall the Intel Corporation or contributors be liable for any direct,
     33 // indirect, incidental, special, exemplary, or consequential damages
     34 // (including, but not limited to, procurement of substitute goods or services;
     35 // loss of use, data, or profits; or business interruption) however caused
     36 // and on any theory of liability, whether in contract, strict liability,
     37 // or tort (including negligence or otherwise) arising in any way out of
     38 // the use of this software, even if advised of the possibility of such damage.
     39 //
     40 //M*/
     41 
     42 #include "_cxcore.h"
     43 
     44 /****************************************************************************************\
     45 *                                         cvGEMM                                         *
     46 \****************************************************************************************/
     47 
     48 icvBLAS_GEMM_32f_t icvBLAS_GEMM_32f_p = 0;
     49 icvBLAS_GEMM_64f_t icvBLAS_GEMM_64f_p = 0;
     50 icvBLAS_GEMM_32fc_t icvBLAS_GEMM_32fc_p = 0;
     51 icvBLAS_GEMM_64fc_t icvBLAS_GEMM_64fc_p = 0;
     52 
     53 static void
     54 icvGEMM_CopyBlock( const uchar* src, int src_step,
     55                    uchar* dst, int dst_step,
     56                    CvSize size, int pix_size )
     57 {
     58     int j;
     59     size.width = size.width * (pix_size / sizeof(int));
     60 
     61     for( ; size.height--; src += src_step, dst += dst_step )
     62     {
     63         for( j = 0; j <= size.width - 4; j += 4 )
     64         {
     65             int t0 = ((const int*)src)[j];
     66             int t1 = ((const int*)src)[j+1];
     67             ((int*)dst)[j] = t0;
     68             ((int*)dst)[j+1] = t1;
     69             t0 = ((const int*)src)[j+2];
     70             t1 = ((const int*)src)[j+3];
     71             ((int*)dst)[j+2] = t0;
     72             ((int*)dst)[j+3] = t1;
     73         }
     74 
     75         for( ; j < size.width; j++ )
     76             ((int*)dst)[j] = ((const int*)src)[j];
     77     }
     78 }
     79 
     80 
     81 static void
     82 icvGEMM_TransposeBlock( const uchar* src, int src_step,
     83                         uchar* dst, int dst_step,
     84                         CvSize size, int pix_size )
     85 {
     86     int i, j;
     87     for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size )
     88     {
     89         const uchar* _src = src;
     90         switch( pix_size )
     91         {
     92         case sizeof(int):
     93             for( j = 0; j < size.height; j++, _src += src_step )
     94                 ((int*)dst)[j] = ((int*)_src)[0];
     95             break;
     96         case sizeof(int)*2:
     97             for( j = 0; j < size.height*2; j += 2, _src += src_step )
     98             {
     99                 int t0 = ((int*)_src)[0];
    100                 int t1 = ((int*)_src)[1];
    101                 ((int*)dst)[j] = t0;
    102                 ((int*)dst)[j+1] = t1;
    103             }
    104             break;
    105         case sizeof(int)*4:
    106             for( j = 0; j < size.height*4; j += 4, _src += src_step )
    107             {
    108                 int t0 = ((int*)_src)[0];
    109                 int t1 = ((int*)_src)[1];
    110                 ((int*)dst)[j] = t0;
    111                 ((int*)dst)[j+1] = t1;
    112                 t0 = ((int*)_src)[2];
    113                 t1 = ((int*)_src)[3];
    114                 ((int*)dst)[j+2] = t0;
    115                 ((int*)dst)[j+3] = t1;
    116             }
    117             break;
    118         default:
    119             assert(0);
    120             return;
    121         }
    122     }
    123 }
    124 
    125 #define ICV_DEF_GEMM_SINGLE_MUL( flavor, arrtype, worktype )                \
    126 static CvStatus CV_STDCALL                                                  \
    127 icvGEMMSingleMul_##flavor( const arrtype* a_data, size_t a_step,            \
    128                          const arrtype* b_data, size_t b_step,              \
    129                          const arrtype* c_data, size_t c_step,              \
    130                          arrtype* d_data, size_t d_step,                    \
    131                          CvSize a_size, CvSize d_size,                      \
    132                          double alpha, double beta, int flags )             \
    133 {                                                                           \
    134     int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height; \
    135     const arrtype *_a_data = a_data, *_b_data = b_data, *_c_data = c_data;  \
    136     arrtype* a_buf = 0;                                                     \
    137     size_t a_step0, a_step1, c_step0, c_step1, t_step;                      \
    138                                                                             \
    139     a_step /= sizeof(a_data[0]);                                            \
    140     b_step /= sizeof(b_data[0]);                                            \
    141     c_step /= sizeof(c_data[0]);                                            \
    142     d_step /= sizeof(d_data[0]);                                            \
    143     a_step0 = a_step;                                                       \
    144     a_step1 = 1;                                                            \
    145                                                                             \
    146     if( !c_data )                                                           \
    147         c_step0 = c_step1 = 0;                                              \
    148     else if( !(flags & CV_GEMM_C_T) )                                       \
    149         c_step0 = c_step, c_step1 = 1;                                      \
    150     else                                                                    \
    151         c_step0 = 1, c_step1 = c_step;                                      \
    152                                                                             \
    153     if( flags & CV_GEMM_A_T )                                               \
    154     {                                                                       \
    155         CV_SWAP( a_step0, a_step1, t_step );                                \
    156         n = a_size.height;                                                  \
    157         if( a_step > 1 && n > 1 )                                           \
    158             a_buf = (arrtype*)cvStackAlloc(n*sizeof(a_data[0]));            \
    159     }                                                                       \
    160                                                                             \
    161     if( n == 1 ) /* external product */                                     \
    162     {                                                                       \
    163         arrtype* b_buf = 0;                                                 \
    164                                                                             \
    165         if( a_step > 1 )                                                    \
    166         {                                                                   \
    167             a_buf = (arrtype*)cvStackAlloc(drows*sizeof(a_data[0]));        \
    168             for( k = 0; k < drows; k++ )                                    \
    169                 a_buf[k] = a_data[a_step*k];                                \
    170             a_data = a_buf;                                                 \
    171         }                                                                   \
    172                                                                             \
    173         if( b_step > 1 )                                                    \
    174         {                                                                   \
    175             b_buf = (arrtype*)cvStackAlloc(d_size.width*sizeof(b_buf[0]) ); \
    176             for( j = 0; j < d_size.width; j++ )                             \
    177                 b_buf[j] = b_data[j*b_step];                                \
    178             b_data = b_buf;                                                 \
    179         }                                                                   \
    180                                                                             \
    181         for( i = 0; i < drows; i++, _c_data += c_step0,                     \
    182                                     d_data += d_step )                      \
    183         {                                                                   \
    184             worktype al = worktype(a_data[i])*alpha;                        \
    185             c_data = _c_data;                                               \
    186             for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 )\
    187             {                                                               \
    188                 worktype s0 = al*b_data[j];                                 \
    189                 worktype s1 = al*b_data[j+1];                               \
    190                 if( !c_data )                                               \
    191                 {                                                           \
    192                     d_data[j] = arrtype(s0);                                \
    193                     d_data[j+1] = arrtype(s1);                              \
    194                 }                                                           \
    195                 else                                                        \
    196                 {                                                           \
    197                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
    198                     d_data[j+1] = arrtype(s1 + c_data[c_step1]*beta);       \
    199                 }                                                           \
    200             }                                                               \
    201                                                                             \
    202             for( ; j < d_size.width; j++, c_data += c_step1 )               \
    203             {                                                               \
    204                 worktype s0 = al*b_data[j];                                 \
    205                 if( !c_data )                                               \
    206                     d_data[j] = arrtype(s0);                                \
    207                 else                                                        \
    208                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
    209             }                                                               \
    210         }                                                                   \
    211     }                                                                       \
    212     else if( flags & CV_GEMM_B_T ) /* A * Bt */                             \
    213     {                                                                       \
    214         for( i = 0; i < drows; i++, _a_data += a_step0,                     \
    215                                     _c_data += c_step0,                     \
    216                                     d_data += d_step )                      \
    217         {                                                                   \
    218             a_data = _a_data;                                               \
    219             b_data = _b_data;                                               \
    220             c_data = _c_data;                                               \
    221                                                                             \
    222             if( a_buf )                                                     \
    223             {                                                               \
    224                 for( k = 0; k < n; k++ )                                    \
    225                     a_buf[k] = a_data[a_step1*k];                           \
    226                 a_data = a_buf;                                             \
    227             }                                                               \
    228                                                                             \
    229             for( j = 0; j < d_size.width; j++, b_data += b_step,            \
    230                                                c_data += c_step1 )          \
    231             {                                                               \
    232                 worktype s0(0), s1(0), s2(0), s3(0);                        \
    233                                                                             \
    234                 for( k = 0; k <= n - 4; k += 4 )                            \
    235                 {                                                           \
    236                     s0 += worktype(a_data[k])*b_data[k];                    \
    237                     s1 += worktype(a_data[k+1])*b_data[k+1];                \
    238                     s2 += worktype(a_data[k+2])*b_data[k+2];                \
    239                     s3 += worktype(a_data[k+3])*b_data[k+3];                \
    240                 }                                                           \
    241                                                                             \
    242                 for( ; k < n; k++ )                                         \
    243                     s0 += worktype(a_data[k])*b_data[k];                    \
    244                 s0 = (s0+s1+s2+s3)*alpha;                                   \
    245                                                                             \
    246                 if( !c_data )                                               \
    247                     d_data[j] = arrtype(s0);                                \
    248                 else                                                        \
    249                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
    250             }                                                               \
    251         }                                                                   \
    252     }                                                                       \
    253     else if( d_size.width*sizeof(d_data[0]) <= 1600 )                       \
    254     {                                                                       \
    255         for( i = 0; i < drows; i++, _a_data += a_step0,                     \
    256                                     _c_data += c_step0,                     \
    257                                     d_data += d_step )                      \
    258         {                                                                   \
    259             a_data = _a_data, c_data = _c_data;                             \
    260                                                                             \
    261             if( a_buf )                                                     \
    262             {                                                               \
    263                 for( k = 0; k < n; k++ )                                    \
    264                     a_buf[k] = a_data[a_step1*k];                           \
    265                 a_data = a_buf;                                             \
    266             }                                                               \
    267                                                                             \
    268             for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 )           \
    269             {                                                               \
    270                 const arrtype* b = _b_data + j;                             \
    271                 worktype s0(0), s1(0), s2(0), s3(0);                        \
    272                                                                             \
    273                 for( k = 0; k < n; k++, b += b_step )                       \
    274                 {                                                           \
    275                     worktype a(a_data[k]);                                  \
    276                     s0 += a * b[0]; s1 += a * b[1];                         \
    277                     s2 += a * b[2]; s3 += a * b[3];                         \
    278                 }                                                           \
    279                                                                             \
    280                 if( !c_data )                                               \
    281                 {                                                           \
    282                     d_data[j] = arrtype(s0*alpha);                          \
    283                     d_data[j+1] = arrtype(s1*alpha);                        \
    284                     d_data[j+2] = arrtype(s2*alpha);                        \
    285                     d_data[j+3] = arrtype(s3*alpha);                        \
    286                 }                                                           \
    287                 else                                                        \
    288                 {                                                           \
    289                     s0 = s0*alpha; s1 = s1*alpha;                           \
    290                     s2 = s2*alpha; s3 = s3*alpha;                           \
    291                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
    292                     d_data[j+1] = arrtype(s1 + c_data[c_step1]*beta);       \
    293                     d_data[j+2] = arrtype(s2 + c_data[c_step1*2]*beta);     \
    294                     d_data[j+3] = arrtype(s3 + c_data[c_step1*3]*beta);     \
    295                 }                                                           \
    296             }                                                               \
    297                                                                             \
    298             for( ; j < m; j++, c_data += c_step1 )                          \
    299             {                                                               \
    300                 const arrtype* b = _b_data + j;                             \
    301                 worktype s0(0);                                             \
    302                                                                             \
    303                 for( k = 0; k < n; k++, b += b_step )                       \
    304                     s0 += worktype(a_data[k]) * b[0];                       \
    305                                                                             \
    306                 s0 = s0*alpha;                                              \
    307                 if( !c_data )                                               \
    308                     d_data[j] = arrtype(s0);                                \
    309                 else                                                        \
    310                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
    311             }                                                               \
    312         }                                                                   \
    313     }                                                                       \
    314     else                                                                    \
    315     {                                                                       \
    316         worktype* d_buf = (worktype*)cvStackAlloc(m*sizeof(d_buf[0]));      \
    317                                                                             \
    318         for( i = 0; i < drows; i++, _a_data += a_step0,                     \
    319                                             _c_data += c_step0,             \
    320                                             d_data += d_step )              \
    321         {                                                                   \
    322             a_data = _a_data;                                               \
    323             b_data = _b_data;                                               \
    324             c_data = _c_data;                                               \
    325                                                                             \
    326             if( a_buf )                                                     \
    327             {                                                               \
    328                 for( k = 0; k < n; k++ )                                    \
    329                     a_buf[k] = _a_data[a_step1*k];                          \
    330                 a_data = a_buf;                                             \
    331             }                                                               \
    332                                                                             \
    333             for( j = 0; j < m; j++ )                                        \
    334                 d_buf[j] = worktype(0);                                     \
    335                                                                             \
    336             for( k = 0; k < n; k++, b_data += b_step )                      \
    337             {                                                               \
    338                 worktype al(a_data[k]);                                     \
    339                                                                             \
    340                 for( j = 0; j <= m - 4; j += 4 )                            \
    341                 {                                                           \
    342                     worktype t0 = d_buf[j] + b_data[j]*al;                  \
    343                     worktype t1 = d_buf[j+1] + b_data[j+1]*al;              \
    344                     d_buf[j] = t0;                                          \
    345                     d_buf[j+1] = t1;                                        \
    346                     t0 = d_buf[j+2] + b_data[j+2]*al;                       \
    347                     t1 = d_buf[j+3] + b_data[j+3]*al;                       \
    348                     d_buf[j+2] = t0;                                        \
    349                     d_buf[j+3] = t1;                                        \
    350                 }                                                           \
    351                                                                             \
    352                 for( ; j < m; j++ )                                         \
    353                     d_buf[j] += b_data[j]*al;                               \
    354             }                                                               \
    355                                                                             \
    356             if( !c_data )                                                   \
    357                 for( j = 0; j < m; j++ )                                    \
    358                     d_data[j] = arrtype(d_buf[j]*alpha);                    \
    359             else                                                            \
    360                 for( j = 0; j < m; j++, c_data += c_step1 )                 \
    361                 {                                                           \
    362                     worktype t = d_buf[j]*alpha;                            \
    363                     d_data[j] = arrtype(t + c_data[0]*beta);                \
    364                 }                                                           \
    365         }                                                                   \
    366     }                                                                       \
    367     return CV_OK;                                                           \
    368 }
    369 
    370 
    371 #define ICV_DEF_GEMM_BLOCK_MUL( flavor, arrtype, worktype )         \
    372 static CvStatus CV_STDCALL                                          \
    373 icvGEMMBlockMul_##flavor( const arrtype* a_data, size_t a_step,     \
    374                         const arrtype* b_data, size_t b_step,       \
    375                         worktype* d_data, size_t d_step,            \
    376                         CvSize a_size, CvSize d_size, int flags )   \
    377 {                                                                   \
    378     int i, j, k, n = a_size.width, m = d_size.width;                \
    379     const arrtype *_a_data = a_data, *_b_data = b_data;             \
    380     arrtype* a_buf = 0;                                             \
    381     size_t a_step0, a_step1, t_step;                                \
    382     int do_acc = flags & 16;                                        \
    383                                                                     \
    384     a_step /= sizeof(a_data[0]);                                    \
    385     b_step /= sizeof(b_data[0]);                                    \
    386     d_step /= sizeof(d_data[0]);                                    \
    387                                                                     \
    388     a_step0 = a_step;                                               \
    389     a_step1 = 1;                                                    \
    390                                                                     \
    391     if( flags & CV_GEMM_A_T )                                       \
    392     {                                                               \
    393         CV_SWAP( a_step0, a_step1, t_step );                        \
    394         n = a_size.height;                                          \
    395         a_buf = (arrtype*)cvStackAlloc(n*sizeof(a_data[0]));        \
    396     }                                                               \
    397                                                                     \
    398     if( flags & CV_GEMM_B_T )                                       \
    399     {                                                               \
    400         /* second operand is transposed */                          \
    401         for( i = 0; i < d_size.height; i++, _a_data += a_step0,     \
    402                                             d_data += d_step )      \
    403         {                                                           \
    404             a_data = _a_data; b_data = _b_data;                     \
    405                                                                     \
    406             if( a_buf )                                             \
    407             {                                                       \
    408                 for( k = 0; k < n; k++ )                            \
    409                     a_buf[k] = a_data[a_step1*k];                   \
    410                 a_data = a_buf;                                     \
    411             }                                                       \
    412                                                                     \
    413             for( j = 0; j < d_size.width; j++, b_data += b_step )   \
    414             {                                                       \
    415                 worktype s0 = do_acc ? d_data[j]:worktype(0), s1(0);\
    416                 for( k = 0; k <= n - 2; k += 2 )                    \
    417                 {                                                   \
    418                     s0 += worktype(a_data[k])*b_data[k];            \
    419                     s1 += worktype(a_data[k+1])*b_data[k+1];        \
    420                 }                                                   \
    421                                                                     \
    422                 for( ; k < n; k++ )                                 \
    423                     s0 += worktype(a_data[k])*b_data[k];            \
    424                                                                     \
    425                 d_data[j] = s0 + s1;                                \
    426             }                                                       \
    427         }                                                           \
    428     }                                                               \
    429     else                                                            \
    430     {                                                               \
    431         for( i = 0; i < d_size.height; i++, _a_data += a_step0,     \
    432                                             d_data += d_step )      \
    433         {                                                           \
    434             a_data = _a_data, b_data = _b_data;                     \
    435                                                                     \
    436             if( a_buf )                                             \
    437             {                                                       \
    438                 for( k = 0; k < n; k++ )                            \
    439                     a_buf[k] = a_data[a_step1*k];                   \
    440                 a_data = a_buf;                                     \
    441             }                                                       \
    442                                                                     \
    443             for( j = 0; j <= m - 4; j += 4 )                        \
    444             {                                                       \
    445                 worktype s0, s1, s2, s3;                            \
    446                 const arrtype* b = b_data + j;                      \
    447                                                                     \
    448                 if( do_acc )                                        \
    449                 {                                                   \
    450                     s0 = d_data[j]; s1 = d_data[j+1];               \
    451                     s2 = d_data[j+2]; s3 = d_data[j+3];             \
    452                 }                                                   \
    453                 else                                                \
    454                     s0 = s1 = s2 = s3 = worktype(0);                \
    455                                                                     \
    456                 for( k = 0; k < n; k++, b += b_step )               \
    457                 {                                                   \
    458                     worktype a(a_data[k]);                          \
    459                     s0 += a * b[0]; s1 += a * b[1];                 \
    460                     s2 += a * b[2]; s3 += a * b[3];                 \
    461                 }                                                   \
    462                                                                     \
    463                 d_data[j] = s0; d_data[j+1] = s1;                   \
    464                 d_data[j+2] = s2; d_data[j+3] = s3;                 \
    465             }                                                       \
    466                                                                     \
    467             for( ; j < m; j++ )                                     \
    468             {                                                       \
    469                 const arrtype* b = b_data + j;                      \
    470                 worktype s0 = do_acc ? d_data[j] : worktype(0);     \
    471                                                                     \
    472                 for( k = 0; k < n; k++, b += b_step )               \
    473                     s0 += worktype(a_data[k]) * b[0];               \
    474                                                                     \
    475                 d_data[j] = s0;                                     \
    476             }                                                       \
    477         }                                                           \
    478     }                                                               \
    479                                                                     \
    480     return CV_OK;                                                   \
    481 }
    482 
    483 
    484 #define ICV_DEF_GEMM_STORE( flavor, arrtype, worktype )             \
    485 static CvStatus CV_STDCALL                                          \
    486 icvGEMMStore_##flavor( const arrtype* c_data, size_t c_step,        \
    487                        const worktype* d_buf, size_t d_buf_step,    \
    488                        arrtype* d_data, size_t d_step, CvSize d_size,\
    489                        double alpha, double beta, int flags )       \
    490 {                                                                   \
    491     const arrtype* _c_data = c_data;                                \
    492     int j;                                                          \
    493     size_t c_step0, c_step1;                                        \
    494                                                                     \
    495     c_step /= sizeof(c_data[0]);                                    \
    496     d_buf_step /= sizeof(d_buf[0]);                                 \
    497     d_step /= sizeof(d_data[0]);                                    \
    498                                                                     \
    499     if( !c_data )                                                   \
    500         c_step0 = c_step1 = 0;                                      \
    501     else if( !(flags & CV_GEMM_C_T) )                               \
    502         c_step0 = c_step, c_step1 = 1;                              \
    503     else                                                            \
    504         c_step0 = 1, c_step1 = c_step;                              \
    505                                                                     \
    506     for( ; d_size.height--; _c_data += c_step0,                     \
    507                             d_buf += d_buf_step,                    \
    508                             d_data += d_step )                      \
    509     {                                                               \
    510         if( _c_data )                                               \
    511         {                                                           \
    512             c_data = _c_data;                                       \
    513             for( j = 0; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )\
    514             {                                                       \
    515                 worktype t0 = alpha*d_buf[j];                       \
    516                 worktype t1 = alpha*d_buf[j+1];                     \
    517                 t0 += beta*worktype(c_data[0]);                     \
    518                 t1 += beta*worktype(c_data[c_step1]);               \
    519                 d_data[j] = arrtype(t0);                            \
    520                 d_data[j+1] = arrtype(t1);                          \
    521                 t0 = alpha*d_buf[j+2];                              \
    522                 t1 = alpha*d_buf[j+3];                              \
    523                 t0 += beta*worktype(c_data[c_step1*2]);             \
    524                 t1 += beta*worktype(c_data[c_step1*3]);             \
    525                 d_data[j+2] = arrtype(t0);                          \
    526                 d_data[j+3] = arrtype(t1);                          \
    527             }                                                       \
    528             for( ; j < d_size.width; j++, c_data += c_step1 )       \
    529             {                                                       \
    530                 worktype t0 = alpha*d_buf[j];                       \
    531                 d_data[j] = arrtype(t0 + beta*c_data[0]);           \
    532             }                                                       \
    533         }                                                           \
    534         else                                                        \
    535         {                                                           \
    536             for( j = 0; j <= d_size.width - 4; j += 4 )             \
    537             {                                                       \
    538                 worktype t0 = alpha*d_buf[j];                       \
    539                 worktype t1 = alpha*d_buf[j+1];                     \
    540                 d_data[j] = arrtype(t0);                            \
    541                 d_data[j+1] = arrtype(t1);                          \
    542                 t0 = alpha*d_buf[j+2];                              \
    543                 t1 = alpha*d_buf[j+3];                              \
    544                 d_data[j+2] = arrtype(t0);                          \
    545                 d_data[j+3] = arrtype(t1);                          \
    546             }                                                       \
    547             for( ; j < d_size.width; j++ )                          \
    548                 d_data[j] = arrtype(alpha*d_buf[j]);                \
    549         }                                                           \
    550     }                                                               \
    551     return CV_OK;                                                   \
    552 }
    553 
    554 
    555 ICV_DEF_GEMM_SINGLE_MUL( 32f_C1R, float, double)
    556 ICV_DEF_GEMM_BLOCK_MUL( 32f_C1R, float, double)
    557 ICV_DEF_GEMM_STORE( 32f_C1R, float, double)
    558 
    559 ICV_DEF_GEMM_SINGLE_MUL( 64f_C1R, double, double)
    560 ICV_DEF_GEMM_BLOCK_MUL( 64f_C1R, double, double)
    561 ICV_DEF_GEMM_STORE( 64f_C1R, double, double)
    562 
    563 ICV_DEF_GEMM_SINGLE_MUL( 32f_C2R, CvComplex32f, CvComplex64f)
    564 ICV_DEF_GEMM_BLOCK_MUL( 32f_C2R, CvComplex32f, CvComplex64f)
    565 ICV_DEF_GEMM_STORE( 32f_C2R, CvComplex32f, CvComplex64f)
    566 
    567 ICV_DEF_GEMM_SINGLE_MUL( 64f_C2R, CvComplex64f, CvComplex64f)
    568 ICV_DEF_GEMM_BLOCK_MUL( 64f_C2R, CvComplex64f, CvComplex64f)
    569 ICV_DEF_GEMM_STORE( 64f_C2R, CvComplex64f, CvComplex64f)
    570 
    571 typedef CvStatus (CV_STDCALL *CvGEMMSingleMulFunc)( const void* src1, size_t step1,
    572                    const void* src2, size_t step2, const void* src3, size_t step3,
    573                    void* dst, size_t dststep, CvSize srcsize, CvSize dstsize,
    574                    double alpha, double beta, int flags );
    575 
    576 typedef CvStatus (CV_STDCALL *CvGEMMBlockMulFunc)( const void* src1, size_t step1,
    577                    const void* src2, size_t step2, void* dst, size_t dststep,
    578                    CvSize srcsize, CvSize dstsize, int flags );
    579 
    580 typedef CvStatus (CV_STDCALL *CvGEMMStoreFunc)( const void* src1, size_t step1,
    581                    const void* src2, size_t step2, void* dst, size_t dststep,
    582                    CvSize dstsize, double alpha, double beta, int flags );
    583 
    584 
    585 static void icvInitGEMMTable( CvBigFuncTable* single_mul_tab,
    586                               CvBigFuncTable* block_mul_tab,
    587                               CvBigFuncTable* store_tab )
    588 {
    589     single_mul_tab->fn_2d[CV_32FC1] = (void*)icvGEMMSingleMul_32f_C1R;
    590     single_mul_tab->fn_2d[CV_64FC1] = (void*)icvGEMMSingleMul_64f_C1R;
    591     single_mul_tab->fn_2d[CV_32FC2] = (void*)icvGEMMSingleMul_32f_C2R;
    592     single_mul_tab->fn_2d[CV_64FC2] = (void*)icvGEMMSingleMul_64f_C2R;
    593     block_mul_tab->fn_2d[CV_32FC1] = (void*)icvGEMMBlockMul_32f_C1R;
    594     block_mul_tab->fn_2d[CV_64FC1] = (void*)icvGEMMBlockMul_64f_C1R;
    595     block_mul_tab->fn_2d[CV_32FC2] = (void*)icvGEMMBlockMul_32f_C2R;
    596     block_mul_tab->fn_2d[CV_64FC2] = (void*)icvGEMMBlockMul_64f_C2R;
    597     store_tab->fn_2d[CV_32FC1] = (void*)icvGEMMStore_32f_C1R;
    598     store_tab->fn_2d[CV_64FC1] = (void*)icvGEMMStore_64f_C1R;
    599     store_tab->fn_2d[CV_32FC2] = (void*)icvGEMMStore_32f_C2R;
    600     store_tab->fn_2d[CV_64FC2] = (void*)icvGEMMStore_64f_C2R;
    601 }
    602 
    603 
    604 CV_IMPL void
    605 cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
    606         const CvArr* Carr, double beta, CvArr* Darr, int flags )
    607 {
    608     const int block_lin_size = 128;
    609     const int block_size = block_lin_size * block_lin_size;
    610 
    611     static CvBigFuncTable single_mul_tab, block_mul_tab, store_tab;
    612     static int inittab = 0;
    613     static double zero[] = {0,0,0,0};
    614     static float zerof[] = {0,0,0,0};
    615 
    616     uchar* buffer = 0;
    617     int local_alloc = 0;
    618     uchar* block_buffer = 0;
    619 
    620     CV_FUNCNAME( "cvGEMM" );
    621 
    622     __BEGIN__;
    623 
    624     CvMat *A = (CvMat*)Aarr;
    625     CvMat *B = (CvMat*)Barr;
    626     CvMat *C = (CvMat*)Carr;
    627     CvMat *D = (CvMat*)Darr;
    628     int len = 0;
    629 
    630     CvMat stub, stub1, stub2, stub3;
    631     CvSize a_size, d_size;
    632     int type;
    633 
    634     if( !CV_IS_MAT( A ))
    635     {
    636         int coi = 0;
    637         CV_CALL( A = cvGetMat( A, &stub1, &coi ));
    638 
    639         if( coi != 0 )
    640             CV_ERROR( CV_BadCOI, "" );
    641     }
    642 
    643     if( !CV_IS_MAT( B ))
    644     {
    645         int coi = 0;
    646         CV_CALL( B = cvGetMat( B, &stub2, &coi ));
    647 
    648         if( coi != 0 )
    649             CV_ERROR( CV_BadCOI, "" );
    650     }
    651 
    652     if( !CV_IS_MAT( D ))
    653     {
    654         int coi = 0;
    655         CV_CALL( D = cvGetMat( D, &stub, &coi ));
    656 
    657         if( coi != 0 )
    658             CV_ERROR( CV_BadCOI, "" );
    659     }
    660 
    661     if( beta == 0 )
    662         C = 0;
    663 
    664     if( C )
    665     {
    666         if( !CV_IS_MAT( C ))
    667         {
    668             int coi = 0;
    669             CV_CALL( C = cvGetMat( C, &stub3, &coi ));
    670 
    671             if( coi != 0 )
    672                 CV_ERROR( CV_BadCOI, "" );
    673         }
    674 
    675         if( !CV_ARE_TYPES_EQ( C, D ))
    676             CV_ERROR( CV_StsUnmatchedFormats, "" );
    677 
    678         if( ((flags&CV_GEMM_C_T) == 0 && (C->cols != D->cols || C->rows != D->rows)) ||
    679             ((flags&CV_GEMM_C_T) != 0 && (C->rows != D->cols || C->cols != D->rows)))
    680             CV_ERROR( CV_StsUnmatchedSizes, "" );
    681 
    682         if( (flags & CV_GEMM_C_T) != 0 && C->data.ptr == D->data.ptr )
    683         {
    684             cvTranspose( C, D );
    685             C = D;
    686             flags &= ~CV_GEMM_C_T;
    687         }
    688     }
    689     else
    690     {
    691         C = &stub3;
    692         C->data.ptr = 0;
    693         C->step = 0;
    694         C->type = CV_MAT_CONT_FLAG;
    695     }
    696 
    697     type = CV_MAT_TYPE(A->type);
    698     if( !CV_ARE_TYPES_EQ( A, B ) || !CV_ARE_TYPES_EQ( A, D ) )
    699         CV_ERROR( CV_StsUnmatchedFormats, "" );
    700 
    701     a_size.width = A->cols;
    702     a_size.height = A->rows;
    703     d_size.width = D->cols;
    704     d_size.height = D->rows;
    705 
    706     switch( flags & (CV_GEMM_A_T|CV_GEMM_B_T) )
    707     {
    708     case 0:
    709         len = B->rows;
    710         if( a_size.width != len ||
    711             B->cols != d_size.width ||
    712             a_size.height != d_size.height )
    713             CV_ERROR( CV_StsUnmatchedSizes, "" );
    714         break;
    715     case 1:
    716         len = B->rows;
    717         if( a_size.height != len ||
    718             B->cols != d_size.width ||
    719             a_size.width != d_size.height )
    720             CV_ERROR( CV_StsUnmatchedSizes, "" );
    721         break;
    722     case 2:
    723         len = B->cols;
    724         if( a_size.width != len ||
    725             B->rows != d_size.width ||
    726             a_size.height != d_size.height )
    727             CV_ERROR( CV_StsUnmatchedSizes, "" );
    728         break;
    729     case 3:
    730         len = B->cols;
    731         if( a_size.height != len ||
    732             B->rows != d_size.width ||
    733             a_size.width != d_size.height )
    734             CV_ERROR( CV_StsUnmatchedSizes, "" );
    735         break;
    736     }
    737 
    738     if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
    739     {
    740         int i;
    741         if( type == CV_64F )
    742         {
    743             double* d = D->data.db;
    744             const double *a = A->data.db, *b = B->data.db, *c = C->data.db;
    745             size_t d_step = D->step/sizeof(d[0]),
    746                    a_step = A->step/sizeof(a[0]),
    747                    b_step = B->step/sizeof(b[0]),
    748                    c_step = C->step/sizeof(c[0]);
    749 
    750             if( !c )
    751                 c = zero;
    752 
    753             switch( len )
    754             {
    755             case 2:
    756                 if( len == d_size.width && b != d )
    757                 {
    758                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
    759                     {
    760                         double t0 = a[0]*b[0] + a[1]*b[b_step];
    761                         double t1 = a[0]*b[1] + a[1]*b[b_step+1];
    762                         d[0] = t0*alpha + c[0]*beta;
    763                         d[1] = t1*alpha + c[1]*beta;
    764                     }
    765                 }
    766                 else if( a != d )
    767                 {
    768                     int c_step0 = 1;
    769                     if( c == zero )
    770                     {
    771                         c_step0 = 0;
    772                         c_step = 1;
    773                     }
    774 
    775                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
    776                     {
    777                         double t0 = a[0]*b[0] + a[1]*b[b_step];
    778                         double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
    779                         d[0] = t0*alpha + c[0]*beta;
    780                         d[d_step] = t1*alpha + c[c_step]*beta;
    781                     }
    782                 }
    783                 else
    784                     break;
    785                 EXIT;
    786             case 3:
    787                 if( len == d_size.width && b != d )
    788                 {
    789                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
    790                     {
    791                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
    792                         double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
    793                         double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
    794                         d[0] = t0*alpha + c[0]*beta;
    795                         d[1] = t1*alpha + c[1]*beta;
    796                         d[2] = t2*alpha + c[2]*beta;
    797                     }
    798                 }
    799                 else if( a != d )
    800                 {
    801                     int c_step0 = 1;
    802                     if( c == zero )
    803                     {
    804                         c_step0 = 0;
    805                         c_step = 1;
    806                     }
    807 
    808                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
    809                     {
    810                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
    811                         double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
    812                         double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];
    813 
    814                         d[0] = t0*alpha + c[0]*beta;
    815                         d[d_step] = t1*alpha + c[c_step]*beta;
    816                         d[d_step*2] = t2*alpha + c[c_step*2]*beta;
    817                     }
    818                 }
    819                 else
    820                     break;
    821                 EXIT;
    822             case 4:
    823                 if( len == d_size.width && b != d )
    824                 {
    825                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
    826                     {
    827                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
    828                         double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
    829                         double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
    830                         double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
    831                         d[0] = t0*alpha + c[0]*beta;
    832                         d[1] = t1*alpha + c[1]*beta;
    833                         d[2] = t2*alpha + c[2]*beta;
    834                         d[3] = t3*alpha + c[3]*beta;
    835                     }
    836                 }
    837                 else if( d_size.width <= 16 && a != d )
    838                 {
    839                     int c_step0 = 1;
    840                     if( c == zero )
    841                     {
    842                         c_step0 = 0;
    843                         c_step = 1;
    844                     }
    845 
    846                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
    847                     {
    848                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
    849                         double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
    850                                     a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
    851                         double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
    852                                     a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
    853                         double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
    854                                     a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
    855                         d[0] = t0*alpha + c[0]*beta;
    856                         d[d_step] = t1*alpha + c[c_step]*beta;
    857                         d[d_step*2] = t2*alpha + c[c_step*2]*beta;
    858                         d[d_step*3] = t3*alpha + c[c_step*3]*beta;
    859                     }
    860                 }
    861                 else
    862                     break;
    863                 EXIT;
    864             }
    865         }
    866 
    867         if( type == CV_32F )
    868         {
    869             float* d = D->data.fl;
    870             const float *a = A->data.fl, *b = B->data.fl, *c = C->data.fl;
    871             size_t d_step = D->step/sizeof(d[0]),
    872                    a_step = A->step/sizeof(a[0]),
    873                    b_step = B->step/sizeof(b[0]),
    874                    c_step = C->step/sizeof(c[0]);
    875 
    876             if( !c )
    877                 c = zerof;
    878 
    879             switch( len )
    880             {
    881             case 2:
    882                 if( len == d_size.width && b != d )
    883                 {
    884                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
    885                     {
    886                         float t0 = a[0]*b[0] + a[1]*b[b_step];
    887                         float t1 = a[0]*b[1] + a[1]*b[b_step+1];
    888                         d[0] = (float)(t0*alpha + c[0]*beta);
    889                         d[1] = (float)(t1*alpha + c[1]*beta);
    890                     }
    891                 }
    892                 else if( a != d )
    893                 {
    894                     int c_step0 = 1;
    895                     if( c == zerof )
    896                     {
    897                         c_step0 = 0;
    898                         c_step = 1;
    899                     }
    900 
    901                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
    902                     {
    903                         float t0 = a[0]*b[0] + a[1]*b[b_step];
    904                         float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
    905                         d[0] = (float)(t0*alpha + c[0]*beta);
    906                         d[d_step] = (float)(t1*alpha + c[c_step]*beta);
    907                     }
    908                 }
    909                 else
    910                     break;
    911                 EXIT;
    912             case 3:
    913                 if( len == d_size.width && b != d )
    914                 {
    915                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
    916                     {
    917                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
    918                         float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
    919                         float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
    920                         d[0] = (float)(t0*alpha + c[0]*beta);
    921                         d[1] = (float)(t1*alpha + c[1]*beta);
    922                         d[2] = (float)(t2*alpha + c[2]*beta);
    923                     }
    924                 }
    925                 else if( a != d )
    926                 {
    927                     int c_step0 = 1;
    928                     if( c == zerof )
    929                     {
    930                         c_step0 = 0;
    931                         c_step = 1;
    932                     }
    933 
    934                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
    935                     {
    936                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
    937                         float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
    938                         float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];
    939 
    940                         d[0] = (float)(t0*alpha + c[0]*beta);
    941                         d[d_step] = (float)(t1*alpha + c[c_step]*beta);
    942                         d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
    943                     }
    944                 }
    945                 else
    946                     break;
    947                 EXIT;
    948             case 4:
    949                 if( len == d_size.width && b != d )
    950                 {
    951                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
    952                     {
    953                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
    954                         float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
    955                         float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
    956                         float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
    957                         d[0] = (float)(t0*alpha + c[0]*beta);
    958                         d[1] = (float)(t1*alpha + c[1]*beta);
    959                         d[2] = (float)(t2*alpha + c[2]*beta);
    960                         d[3] = (float)(t3*alpha + c[3]*beta);
    961                     }
    962                 }
    963                 else if( len <= 16 && a != d )
    964                 {
    965                     int c_step0 = 1;
    966                     if( c == zerof )
    967                     {
    968                         c_step0 = 0;
    969                         c_step = 1;
    970                     }
    971 
    972                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
    973                     {
    974                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
    975                         float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
    976                                    a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
    977                         float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
    978                                    a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
    979                         float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
    980                                    a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
    981                         d[0] = (float)(t0*alpha + c[0]*beta);
    982                         d[d_step] = (float)(t1*alpha + c[c_step]*beta);
    983                         d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
    984                         d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta);
    985                     }
    986                 }
    987                 else
    988                     break;
    989                 EXIT;
    990             }
    991         }
    992     }
    993 
    994     {
    995         int b_step = B->step;
    996         CvGEMMSingleMulFunc single_mul_func;
    997         CvMat tmat, *D0 = D;
    998         icvBLAS_GEMM_32f_t blas_func = 0;
    999 
   1000         if( !inittab )
   1001         {
   1002             icvInitGEMMTable( &single_mul_tab, &block_mul_tab, &store_tab );
   1003             inittab = 1;
   1004         }
   1005 
   1006         single_mul_func = (CvGEMMSingleMulFunc)single_mul_tab.fn_2d[type];
   1007         if( !single_mul_func )
   1008             CV_ERROR( CV_StsUnsupportedFormat, "" );
   1009 
   1010         if( D->data.ptr == A->data.ptr || D->data.ptr == B->data.ptr )
   1011         {
   1012             int buf_size = d_size.width*d_size.height*CV_ELEM_SIZE(type);
   1013             if( d_size.width <= CV_MAX_LOCAL_MAT_SIZE )
   1014             {
   1015                 buffer = (uchar*)cvStackAlloc( buf_size );
   1016                 local_alloc = 1;
   1017             }
   1018             else
   1019                 CV_CALL( buffer = (uchar*)cvAlloc( buf_size ));
   1020 
   1021             tmat = cvMat( d_size.height, d_size.width, type, buffer );
   1022             D = &tmat;
   1023         }
   1024 
   1025         if( (d_size.width == 1 || len == 1) && !(flags & CV_GEMM_B_T) && CV_IS_MAT_CONT(B->type) )
   1026         {
   1027             b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
   1028             flags |= CV_GEMM_B_T;
   1029         }
   1030 
   1031         if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
   1032         {
   1033             blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
   1034                         type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
   1035                         type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
   1036                         type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
   1037         }
   1038 
   1039         if( blas_func )
   1040         {
   1041             const char* transa = flags & CV_GEMM_A_T ? "t" : "n";
   1042             const char* transb = flags & CV_GEMM_B_T ? "t" : "n";
   1043             int lda, ldb, ldd;
   1044 
   1045             if( C->data.ptr )
   1046             {
   1047                 if( C->data.ptr != D->data.ptr )
   1048                 {
   1049                     if( !(flags & CV_GEMM_C_T) )
   1050                         cvCopy( C, D );
   1051                     else
   1052                         cvTranspose( C, D );
   1053                 }
   1054             }
   1055 
   1056             if( CV_MAT_DEPTH(type) == CV_32F )
   1057             {
   1058                 CvComplex32f _alpha, _beta;
   1059 
   1060                 lda = A->step/sizeof(float);
   1061                 ldb = b_step/sizeof(float);
   1062                 ldd = D->step/sizeof(float);
   1063                 _alpha.re = (float)alpha;
   1064                 _alpha.im = 0;
   1065                 _beta.re = C->data.ptr ? (float)beta : 0;
   1066                 _beta.im = 0;
   1067                 if( CV_MAT_CN(type) == 2 )
   1068                     lda /= 2, ldb /= 2, ldd /= 2;
   1069 
   1070                 blas_func( transb, transa, &d_size.width, &d_size.height, &len,
   1071                        &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
   1072                        &_beta, D->data.ptr, &ldd );
   1073             }
   1074             else
   1075             {
   1076                 CvComplex64f _alpha, _beta;
   1077 
   1078                 lda = A->step/sizeof(double);
   1079                 ldb = b_step/sizeof(double);
   1080                 ldd = D->step/sizeof(double);
   1081                 _alpha.re = alpha;
   1082                 _alpha.im = 0;
   1083                 _beta.re = C->data.ptr ? beta : 0;
   1084                 _beta.im = 0;
   1085                 if( CV_MAT_CN(type) == 2 )
   1086                     lda /= 2, ldb /= 2, ldd /= 2;
   1087 
   1088                 blas_func( transb, transa, &d_size.width, &d_size.height, &len,
   1089                        &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
   1090                        &_beta, D->data.ptr, &ldd );
   1091             }
   1092         }
   1093         else if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) &&
   1094             len <= 10000) || len <= 10 ||
   1095             (d_size.width <= block_lin_size && d_size.height <= block_lin_size && len <= block_lin_size) )
   1096         {
   1097             single_mul_func( A->data.ptr, A->step, B->data.ptr, b_step,
   1098                              C->data.ptr, C->step, D->data.ptr, D->step,
   1099                              a_size, d_size, alpha, beta, flags );
   1100         }
   1101         else
   1102         {
   1103             int is_a_t = flags & CV_GEMM_A_T;
   1104             int is_b_t = flags & CV_GEMM_B_T;
   1105             int elem_size = CV_ELEM_SIZE(type);
   1106             int dk0_1, dk0_2;
   1107             int a_buf_size = 0, b_buf_size, d_buf_size;
   1108             uchar* a_buf = 0;
   1109             uchar* b_buf = 0;
   1110             uchar* d_buf = 0;
   1111             int i, j, k, di = 0, dj = 0, dk = 0;
   1112             int dm0, dn0, dk0;
   1113             int a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
   1114             int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
   1115             CvGEMMBlockMulFunc block_mul_func = (CvGEMMBlockMulFunc)block_mul_tab.fn_2d[type];
   1116             CvGEMMStoreFunc store_func = (CvGEMMStoreFunc)store_tab.fn_2d[type];
   1117 
   1118             assert( block_mul_func && store_func );
   1119 
   1120             if( !is_a_t )
   1121                 a_step0 = A->step, a_step1 = elem_size;
   1122             else
   1123                 a_step0 = elem_size, a_step1 = A->step;
   1124 
   1125             if( !is_b_t )
   1126                 b_step0 = b_step, b_step1 = elem_size;
   1127             else
   1128                 b_step0 = elem_size, b_step1 = b_step;
   1129 
   1130             if( !C->data.ptr )
   1131             {
   1132                 c_step0 = c_step1 = 0;
   1133                 flags &= ~CV_GEMM_C_T;
   1134             }
   1135             else if( !(flags & CV_GEMM_C_T) )
   1136                 c_step0 = C->step, c_step1 = elem_size;
   1137             else
   1138                 c_step0 = elem_size, c_step1 = C->step;
   1139 
   1140             dm0 = MIN( block_lin_size, d_size.height );
   1141             dn0 = MIN( block_lin_size, d_size.width );
   1142             dk0_1 = block_size / dm0;
   1143             dk0_2 = block_size / dn0;
   1144             dk0 = MAX( dk0_1, dk0_2 );
   1145             dk0 = MIN( dk0, len );
   1146             if( dk0*dm0 > block_size )
   1147                 dm0 = block_size / dk0;
   1148             if( dk0*dn0 > block_size )
   1149                 dn0 = block_size / dk0;
   1150 
   1151             dk0_1 = (dn0+dn0/8+2) & -2;
   1152             b_buf_size = (dk0+dk0/8+1)*dk0_1*elem_size;
   1153             d_buf_size = (dk0+dk0/8+1)*dk0_1*work_elem_size;
   1154 
   1155             if( is_a_t )
   1156             {
   1157                 a_buf_size = (dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
   1158                 flags &= ~CV_GEMM_A_T;
   1159             }
   1160 
   1161             CV_CALL( block_buffer = (uchar*)cvAlloc(a_buf_size + b_buf_size + d_buf_size));
   1162             d_buf = block_buffer;
   1163             b_buf = d_buf + d_buf_size;
   1164 
   1165             if( is_a_t )
   1166                 a_buf = b_buf + b_buf_size;
   1167 
   1168             for( i = 0; i < d_size.height; i += di )
   1169             {
   1170                 di = dm0;
   1171                 if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
   1172                     di = d_size.height - i;
   1173 
   1174                 for( j = 0; j < d_size.width; j += dj )
   1175                 {
   1176                     uchar* _d = D->data.ptr + i*D->step + j*elem_size;
   1177                     const uchar* _c = C->data.ptr + i*c_step0 + j*c_step1;
   1178                     int _d_step = D->step;
   1179                     dj = dn0;
   1180 
   1181                     if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
   1182                         dj = d_size.width - j;
   1183 
   1184                     flags &= 15;
   1185                     if( dk0 < len )
   1186                     {
   1187                         _d = d_buf;
   1188                         _d_step = dj*work_elem_size;
   1189                     }
   1190 
   1191                     for( k = 0; k < len; k += dk )
   1192                     {
   1193                         const uchar* _a = A->data.ptr + i*a_step0 + k*a_step1;
   1194                         int _a_step = A->step;
   1195                         const uchar* _b = B->data.ptr + k*b_step0 + j*b_step1;
   1196                         int _b_step = b_step;
   1197                         CvSize a_bl_size;
   1198 
   1199                         dk = dk0;
   1200                         if( k + dk >= len || 8*(k + dk) + dk > 8*len )
   1201                             dk = len - k;
   1202 
   1203                         if( !is_a_t )
   1204                             a_bl_size.width = dk, a_bl_size.height = di;
   1205                         else
   1206                             a_bl_size.width = di, a_bl_size.height = dk;
   1207 
   1208                         if( a_buf && is_a_t )
   1209                         {
   1210                             int t;
   1211                             _a_step = dk*elem_size;
   1212                             icvGEMM_TransposeBlock( _a, A->step, a_buf, _a_step, a_bl_size, elem_size );
   1213                             CV_SWAP( a_bl_size.width, a_bl_size.height, t );
   1214                             _a = a_buf;
   1215                         }
   1216 
   1217                         if( dj < d_size.width )
   1218                         {
   1219                             CvSize b_size;
   1220                             if( !is_b_t )
   1221                                 b_size.width = dj, b_size.height = dk;
   1222                             else
   1223                                 b_size.width = dk, b_size.height = dj;
   1224 
   1225                             _b_step = b_size.width*elem_size;
   1226                             icvGEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
   1227                             _b = b_buf;
   1228                         }
   1229 
   1230                         if( dk0 < len )
   1231                             block_mul_func( _a, _a_step, _b, _b_step, _d, _d_step,
   1232                                             a_bl_size, cvSize(dj,di), flags );
   1233                         else
   1234                             single_mul_func( _a, _a_step, _b, _b_step, _c, C->step, _d, _d_step,
   1235                                              a_bl_size, cvSize(dj,di), alpha, beta, flags );
   1236                         flags |= 16;
   1237                     }
   1238 
   1239                     if( dk0 < len )
   1240                         store_func( _c, C->step, _d, _d_step, D->data.ptr + i*D->step + j*elem_size,
   1241                                     D->step, cvSize(dj,di), alpha, beta, flags );
   1242                 }
   1243             }
   1244         }
   1245 
   1246         if( D0 != D )
   1247             CV_CALL( cvCopy( D, D0 ));
   1248     }
   1249 
   1250     __END__;
   1251 
   1252     if( buffer && !local_alloc )
   1253         cvFree( &buffer );
   1254     if( block_buffer )
   1255         cvFree( &block_buffer );
   1256 }
   1257 
   1258 
   1259 /****************************************************************************************\
   1260 *                                        cvTransform                                     *
   1261 \****************************************************************************************/
   1262 
   1263 #define  ICV_DEF_TRANSFORM_CASE_C1( arrtype, temptype, _ld_,        \
   1264                                    _cast_macro1_, _cast_macro2_ )   \
   1265 {                                                                   \
   1266     for( i = 0; i < size.width; i++, dst += dst_cn )                \
   1267     {                                                               \
   1268         const double* _mat = mat;                                   \
   1269         double v0 = _ld_(src[i]);                                   \
   1270         for( k = 0; k < dst_cn; k++, _mat += 2 )                    \
   1271         {                                                           \
   1272             temptype t0 = _cast_macro1_(_mat[0]*v0 + _mat[1]);      \
   1273             dst[k] = _cast_macro2_(t0);                             \
   1274         }                                                           \
   1275     }                                                               \
   1276     src += size.width;                                              \
   1277 }
   1278 
   1279 
   1280 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C1( arrtype, temptype, _ld_,   \
   1281                                   _cast_macro1_, _cast_macro2_ )    \
   1282     for( i = 0; i < size.width; i++ )                               \
   1283     {                                                               \
   1284         double ft0;                                                 \
   1285         temptype t0;                                                \
   1286         ft0 = mat[0]*_ld_(src[i]) + mat[1];                         \
   1287         t0 = _cast_macro1_(ft0);                                    \
   1288         dst[i] = _cast_macro2_(t0);                                 \
   1289     }
   1290 
   1291 
   1292 #define  ICV_DEF_TRANSFORM_CASE_C2( arrtype, temptype, _ld_,        \
   1293                                   _cast_macro1_, _cast_macro2_ )    \
   1294 if( dst_cn == 2 )                                                   \
   1295 {                                                                   \
   1296     for( i = 0; i < size.width*2; i += 2 )                          \
   1297     {                                                               \
   1298         double ft0, ft1;                                            \
   1299         temptype t0, t1;                                            \
   1300         ft0 = mat[0]*_ld_(src[i]) + mat[1]*_ld_(src[i+1]) + mat[2]; \
   1301         ft1 = mat[3]*_ld_(src[i]) + mat[4]*_ld_(src[i+1]) + mat[5]; \
   1302         t0 = _cast_macro1_(ft0);                                    \
   1303         t1 = _cast_macro1_(ft1);                                    \
   1304         dst[i] = _cast_macro2_(t0);                                 \
   1305         dst[i+1] = _cast_macro2_(t1);                               \
   1306     }                                                               \
   1307     src += size.width*2; dst += size.width*2;                       \
   1308 }                                                                   \
   1309 else                                                                \
   1310     for( i = 0; i < size.width; i++, src += 2, dst += dst_cn )      \
   1311     {                                                               \
   1312         const double* _mat = mat;                                   \
   1313         double v0 = _ld_(src[0]), v1 = src[1];                      \
   1314         for( k = 0; k < dst_cn; k++, _mat += 3 )                    \
   1315         {                                                           \
   1316             temptype t0 =                                           \
   1317                 _cast_macro1_(_mat[0]*v0 + _mat[1]*v1 + _mat[2]);   \
   1318             dst[k] = _cast_macro2_(t0);                             \
   1319         }                                                           \
   1320     }
   1321 
   1322 
   1323 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C2( arrtype, temptype, _ld_,   \
   1324                                   _cast_macro1_, _cast_macro2_ )    \
   1325     for( i = 0; i < size.width*2; i += 2 )                          \
   1326     {                                                               \
   1327         double ft0, ft1;                                            \
   1328         temptype t0, t1;                                            \
   1329         ft0 = mat[0]*_ld_(src[i]) + mat[2];                         \
   1330         ft1 = mat[4]*_ld_(src[i+1]) + mat[5];                       \
   1331         t0 = _cast_macro1_(ft0);                                    \
   1332         t1 = _cast_macro1_(ft1);                                    \
   1333         dst[i] = _cast_macro2_(t0);                                 \
   1334         dst[i+1] = _cast_macro2_(t1);                               \
   1335     }
   1336 
   1337 
   1338 #define  ICV_DEF_TRANSFORM_CASE_C3( arrtype, temptype, _ld_,        \
   1339                                   _cast_macro1_, _cast_macro2_ )    \
   1340 if( dst_cn == 3 )                                                   \
   1341 {                                                                   \
   1342     for( i = 0; i < size.width*3; i += 3 )                          \
   1343     {                                                               \
   1344         double ft0, ft1, ft2;                                       \
   1345         temptype t0, t1, t2;                                        \
   1346         ft0 = mat[0]*_ld_(src[i]) + mat[1]*_ld_(src[i+1]) +         \
   1347               mat[2]*_ld_(src[i+2]) + mat[3];                       \
   1348         ft1 = mat[4]*_ld_(src[i]) + mat[5]*_ld_(src[i+1]) +         \
   1349               mat[6]*_ld_(src[i+2]) + mat[7];                       \
   1350         ft2 = mat[8]*_ld_(src[i]) + mat[9]*_ld_(src[i+1]) +         \
   1351               mat[10]*_ld_(src[i+2]) + mat[11];                     \
   1352         t0 = _cast_macro1_(ft0);                                    \
   1353         t1 = _cast_macro1_(ft1);                                    \
   1354         t2 = _cast_macro1_(ft2);                                    \
   1355         dst[i] = _cast_macro2_(t0);                                 \
   1356         dst[i+1] = _cast_macro2_(t1);                               \
   1357         dst[i+2] = _cast_macro2_(t2);                               \
   1358     }                                                               \
   1359     src += size.width*3; dst += size.width*3;                       \
   1360 }                                                                   \
   1361 else if( dst_cn == 1 )                                              \
   1362 {                                                                   \
   1363     for( i = 0; i < size.width; i++, src += 3 )                     \
   1364     {                                                               \
   1365         temptype t0 = _cast_macro1_(mat[0]*_ld_(src[0]) +           \
   1366             mat[1]*_ld_(src[1]) + mat[2]*_ld_(src[2]) + mat[3]);    \
   1367         dst[i] = _cast_macro2_(t0);                                 \
   1368     }                                                               \
   1369     dst += size.width;                                              \
   1370 }                                                                   \
   1371 else                                                                \
   1372     for( i = 0; i < size.width; i++, src += 3, dst += dst_cn )      \
   1373     {                                                               \
   1374         const double* _mat = mat;                                   \
   1375         double v0=_ld_(src[0]), v1=_ld_(src[1]), v2=_ld_(src[2]);   \
   1376         for( k = 0; k < dst_cn; k++, _mat += 4 )                    \
   1377         {                                                           \
   1378             temptype t0 = _cast_macro1_(_mat[0]*v0 +                \
   1379                     _mat[1]*v1 + _mat[2]*v2 + _mat[3]);             \
   1380             dst[k] = _cast_macro2_(t0);                             \
   1381         }                                                           \
   1382     }
   1383 
   1384 
   1385 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C3( arrtype, temptype, _ld_,   \
   1386                                   _cast_macro1_, _cast_macro2_ )    \
   1387     for( i = 0; i < size.width*3; i += 3 )                          \
   1388     {                                                               \
   1389         double ft0, ft1, ft2;                                       \
   1390         temptype t0, t1, t2;                                        \
   1391         ft0 = mat[0]*_ld_(src[i]) + mat[3];                         \
   1392         ft1 = mat[5]*_ld_(src[i+1]) + mat[7];                       \
   1393         ft2 = mat[10]*_ld_(src[i+2]) + mat[11];                     \
   1394         t0 = _cast_macro1_(ft0);                                    \
   1395         t1 = _cast_macro1_(ft1);                                    \
   1396         t2 = _cast_macro1_(ft2);                                    \
   1397         dst[i] = _cast_macro2_(t0);                                 \
   1398         dst[i+1] = _cast_macro2_(t1);                               \
   1399         dst[i+2] = _cast_macro2_(t2);                               \
   1400     }
   1401 
   1402 
   1403 #define  ICV_DEF_TRANSFORM_CASE_C4( arrtype, temptype, _ld_,        \
   1404                                   _cast_macro1_, _cast_macro2_ )    \
   1405 for( i = 0; i < size.width; i++, src += 4, dst += dst_cn )          \
   1406 {                                                                   \
   1407     const double* _mat = mat;                                       \
   1408     double v0 = _ld_(src[0]), v1 = _ld_(src[1]),                    \
   1409            v2 = _ld_(src[2]), v3 = _ld_(src[3]);                    \
   1410     for( k = 0; k < dst_cn; k++, _mat += 5 )                        \
   1411     {                                                               \
   1412         temptype t0 =_cast_macro1_(_mat[0]*v0+_mat[1]*v1+           \
   1413                                    _mat[2]*v2+_mat[3]*v3+_mat[4]);  \
   1414         dst[k] = _cast_macro2_(t0);                                 \
   1415     }                                                               \
   1416 }
   1417 
   1418 
   1419 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C4( arrtype, temptype, _ld_,   \
   1420                                   _cast_macro1_, _cast_macro2_ )    \
   1421     for( i = 0; i < size.width*4; i += 4 )                          \
   1422     {                                                               \
   1423         double ft0, ft1;                                            \
   1424         temptype t0, t1;                                            \
   1425         ft0 = mat[0]*_ld_(src[i]) + mat[4];                         \
   1426         ft1 = mat[6]*_ld_(src[i+1]) + mat[9];                       \
   1427         t0 = _cast_macro1_(ft0);                                    \
   1428         t1 = _cast_macro1_(ft1);                                    \
   1429         dst[i] = _cast_macro2_(t0);                                 \
   1430         dst[i+1] = _cast_macro2_(t1);                               \
   1431         ft0 = mat[12]*_ld_(src[i+2]) + mat[14];                     \
   1432         ft1 = mat[18]*_ld_(src[i+3]) + mat[19];                     \
   1433         t0 = _cast_macro1_(ft0);                                    \
   1434         t1 = _cast_macro1_(ft1);                                    \
   1435         dst[i+2] = _cast_macro2_(t0);                               \
   1436         dst[i+3] = _cast_macro2_(t1);                               \
   1437     }
   1438 
   1439 
   1440 
   1441 #define  ICV_DEF_TRANSFORM_FUNC( flavor, arrtype, temptype, _ld_,   \
   1442                                  _cast_macro1_, _cast_macro2_, cn  )\
   1443 static CvStatus CV_STDCALL                                          \
   1444 icvTransform_##flavor( const arrtype* src, int srcstep,             \
   1445                        arrtype* dst, int dststep, CvSize size,      \
   1446                        const double* mat, int dst_cn )              \
   1447 {                                                                   \
   1448     srcstep = srcstep/sizeof(src[0]) - size.width*cn;               \
   1449     dststep = dststep/sizeof(dst[0]) - size.width*dst_cn;           \
   1450     for( ; size.height--; src += srcstep, dst += dststep )          \
   1451     {                                                               \
   1452         int i, k;                                                   \
   1453         ICV_DEF_TRANSFORM_CASE_C##cn( arrtype, temptype, _ld_,      \
   1454                                      _cast_macro1_, _cast_macro2_ ) \
   1455     }                                                               \
   1456                                                                     \
   1457     return CV_OK;                                                   \
   1458 }
   1459 
   1460 
   1461 #define  ICV_DEF_DIAG_TRANSFORM_FUNC( flavor, arrtype, temptype, _ld_, \
   1462                                  _cast_macro1_, _cast_macro2_, cn  )\
   1463 static CvStatus CV_STDCALL                                          \
   1464 icvDiagTransform_##flavor( const arrtype* src, int srcstep,         \
   1465                        arrtype* dst, int dststep, CvSize size,      \
   1466                        const double* mat )                          \
   1467 {                                                                   \
   1468     srcstep /= sizeof(src[0]);                                      \
   1469     dststep /= sizeof(dst[0]);                                      \
   1470     for( ; size.height--; src += srcstep, dst += dststep )          \
   1471     {                                                               \
   1472         int i;                                                      \
   1473         ICV_DEF_DIAG_TRANSFORM_CASE_C##cn( arrtype, temptype, _ld_, \
   1474                                      _cast_macro1_, _cast_macro2_ ) \
   1475     }                                                               \
   1476                                                                     \
   1477     return CV_OK;                                                   \
   1478 }
   1479 
   1480 
   1481 ICV_DEF_TRANSFORM_FUNC( 8u_C1R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 1 )
   1482 ICV_DEF_TRANSFORM_FUNC( 8u_C2R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 2 )
   1483 ICV_DEF_TRANSFORM_FUNC( 8u_C3R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 3 )
   1484 ICV_DEF_TRANSFORM_FUNC( 8u_C4R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 4 )
   1485 
   1486 ICV_DEF_TRANSFORM_FUNC( 16u_C1R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 1 )
   1487 ICV_DEF_TRANSFORM_FUNC( 16u_C2R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 2 )
   1488 ICV_DEF_TRANSFORM_FUNC( 16u_C3R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 3 )
   1489 ICV_DEF_TRANSFORM_FUNC( 16u_C4R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 4 )
   1490 
   1491 ICV_DEF_TRANSFORM_FUNC( 16s_C1R, short, int, CV_NOP, cvRound, CV_CAST_16S, 1 )
   1492 ICV_DEF_TRANSFORM_FUNC( 16s_C2R, short, int, CV_NOP, cvRound, CV_CAST_16S, 2 )
   1493 ICV_DEF_TRANSFORM_FUNC( 16s_C3R, short, int, CV_NOP, cvRound, CV_CAST_16S, 3 )
   1494 ICV_DEF_TRANSFORM_FUNC( 16s_C4R, short, int, CV_NOP, cvRound, CV_CAST_16S, 4 )
   1495 
   1496 ICV_DEF_TRANSFORM_FUNC( 32s_C1R, int, int, CV_NOP, cvRound, CV_NOP, 1 )
   1497 ICV_DEF_TRANSFORM_FUNC( 32s_C2R, int, int, CV_NOP, cvRound, CV_NOP, 2 )
   1498 ICV_DEF_TRANSFORM_FUNC( 32s_C3R, int, int, CV_NOP, cvRound, CV_NOP, 3 )
   1499 ICV_DEF_TRANSFORM_FUNC( 32s_C4R, int, int, CV_NOP, cvRound, CV_NOP, 4 )
   1500 
   1501 ICV_DEF_TRANSFORM_FUNC( 32f_C1R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 1 )
   1502 ICV_DEF_TRANSFORM_FUNC( 32f_C2R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 2 )
   1503 ICV_DEF_TRANSFORM_FUNC( 32f_C3R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 3 )
   1504 ICV_DEF_TRANSFORM_FUNC( 32f_C4R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 4 )
   1505 
   1506 ICV_DEF_TRANSFORM_FUNC( 64f_C1R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 1 )
   1507 ICV_DEF_TRANSFORM_FUNC( 64f_C2R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 2 )
   1508 ICV_DEF_TRANSFORM_FUNC( 64f_C3R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 3 )
   1509 ICV_DEF_TRANSFORM_FUNC( 64f_C4R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 4 )
   1510 
   1511 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C1R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 1 )
   1512 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C2R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 2 )
   1513 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C3R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 3 )
   1514 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C4R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 4 )
   1515 
   1516 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C1R, short, int, CV_NOP, cvRound, CV_CAST_16S, 1 )
   1517 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C2R, short, int, CV_NOP, cvRound, CV_CAST_16S, 2 )
   1518 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C3R, short, int, CV_NOP, cvRound, CV_CAST_16S, 3 )
   1519 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C4R, short, int, CV_NOP, cvRound, CV_CAST_16S, 4 )
   1520 
   1521 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C1R, int, int, CV_NOP, cvRound, CV_NOP, 1 )
   1522 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C2R, int, int, CV_NOP, cvRound, CV_NOP, 2 )
   1523 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C3R, int, int, CV_NOP, cvRound, CV_NOP, 3 )
   1524 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C4R, int, int, CV_NOP, cvRound, CV_NOP, 4 )
   1525 
   1526 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C1R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 1 )
   1527 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C2R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 2 )
   1528 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C3R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 3 )
   1529 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C4R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 4 )
   1530 
   1531 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C1R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 1 )
   1532 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C2R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 2 )
   1533 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C3R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 3 )
   1534 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C4R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 4 )
   1535 
   1536 #define icvTransform_8s_C1R 0
   1537 #define icvTransform_8s_C2R 0
   1538 #define icvTransform_8s_C3R 0
   1539 #define icvTransform_8s_C4R 0
   1540 
   1541 #define icvDiagTransform_8s_C1R 0
   1542 #define icvDiagTransform_8s_C2R 0
   1543 #define icvDiagTransform_8s_C3R 0
   1544 #define icvDiagTransform_8s_C4R 0
   1545 
   1546 #define icvDiagTransform_8u_C1R 0
   1547 #define icvDiagTransform_8u_C2R 0
   1548 #define icvDiagTransform_8u_C3R 0
   1549 #define icvDiagTransform_8u_C4R 0
   1550 
   1551 CV_DEF_INIT_BIG_FUNC_TAB_2D( Transform, R )
   1552 CV_DEF_INIT_BIG_FUNC_TAB_2D( DiagTransform, R )
   1553 
   1554 typedef CvStatus (CV_STDCALL * CvTransformFunc)(
   1555                        const void* src, int srcstep,
   1556                        void* dst, int dststep, CvSize size,
   1557                        const void* mat, int dst_cn );
   1558 
   1559 typedef CvStatus (CV_STDCALL * CvDiagTransformFunc)(
   1560                        const void* src, int srcstep,
   1561                        void* dst, int dststep, CvSize size,
   1562                        const void* mat );
   1563 
   1564 typedef CvStatus (CV_STDCALL * CvDiagTransformFunc)(
   1565                        const void* src, int srcstep,
   1566                        void* dst, int dststep, CvSize size,
   1567                        const void* mat );
   1568 
   1569 ///////////////////// IPP transform functions //////////////////
   1570 
   1571 icvColorTwist_8u_C3R_t icvColorTwist_8u_C3R_p = 0;
   1572 icvColorTwist_16u_C3R_t icvColorTwist_16u_C3R_p = 0;
   1573 icvColorTwist_16s_C3R_t icvColorTwist_16s_C3R_p = 0;
   1574 icvColorTwist_32f_C3R_t icvColorTwist_32f_C3R_p = 0;
   1575 icvColorTwist_32f_C4R_t icvColorTwist_32f_C4R_p = 0;
   1576 
   1577 icvColorToGray_8u_C3C1R_t icvColorToGray_8u_C3C1R_p = 0;
   1578 icvColorToGray_16u_C3C1R_t icvColorToGray_16u_C3C1R_p = 0;
   1579 icvColorToGray_16s_C3C1R_t icvColorToGray_16s_C3C1R_p = 0;
   1580 icvColorToGray_32f_C3C1R_t icvColorToGray_32f_C3C1R_p = 0;
   1581 
   1582 icvColorToGray_8u_AC4C1R_t icvColorToGray_8u_AC4C1R_p = 0;
   1583 icvColorToGray_16u_AC4C1R_t icvColorToGray_16u_AC4C1R_p = 0;
   1584 icvColorToGray_16s_AC4C1R_t icvColorToGray_16s_AC4C1R_p = 0;
   1585 icvColorToGray_32f_AC4C1R_t icvColorToGray_32f_AC4C1R_p = 0;
   1586 
   1587 typedef CvStatus (CV_STDCALL * CvColorTwistIPPFunc)( const void* src, int srcstep,
   1588                         void* dst, int dststep, CvSize size, const float* coeffs );
   1589 
   1590 ////////////////////////////////////////////////////////////////
   1591 
   1592 CV_IMPL void
   1593 cvTransform( const CvArr* srcarr, CvArr* dstarr,
   1594              const CvMat* transmat, const CvMat* shiftvec )
   1595 {
   1596     static CvBigFuncTable transform_tab, diag_transform_tab;
   1597     static int inittab = 0;
   1598     CvMat* lut = 0;
   1599 
   1600     CV_FUNCNAME( "cvTransform" );
   1601 
   1602     __BEGIN__;
   1603 
   1604     CvMat srcstub, *src = (CvMat*)srcarr;
   1605     CvMat dststub, *dst = (CvMat*)dstarr;
   1606     CvMat rotstub, *rot = (CvMat*)transmat;
   1607     CvMat shiftstub, *shift = (CvMat*)shiftvec;
   1608     CvSeq *src_seq = 0, *dst_seq = 0;
   1609     CvSeq hdr; // need only one copy of stub header & seqblock (either for src or dst)
   1610     CvSeqBlock block_hdr;
   1611     int i, j, type, cn, dst_cn;
   1612     int coi = 0, coi2 = 0;
   1613     double* buffer = (double*)cvStackAlloc( CV_CN_MAX*(CV_CN_MAX+1)*sizeof(buffer[0]) );
   1614 
   1615     if( !inittab )
   1616     {
   1617         icvInitTransformRTable( &transform_tab );
   1618         icvInitDiagTransformRTable( &diag_transform_tab );
   1619         inittab = 1;
   1620     }
   1621 
   1622     if( CV_IS_SEQ( src ))
   1623     {
   1624         src_seq = (CvSeq*)src;
   1625         if( CV_ELEM_SIZE(src_seq->flags) != src_seq->elem_size )
   1626             CV_ERROR( CV_StsUnsupportedFormat, "Unsupported type of sequence elements" );
   1627     }
   1628     else
   1629         CV_CALL( src = cvGetMat( src, &srcstub, &coi ));
   1630 
   1631     if( CV_IS_SEQ( dst ))
   1632     {
   1633         dst_seq = (CvSeq*)dst;
   1634         if( CV_ELEM_SIZE(dst_seq->flags) != dst_seq->elem_size )
   1635             CV_ERROR( CV_StsUnsupportedFormat, "Unsupported type of sequence elements" );
   1636     }
   1637     else
   1638         CV_CALL( dst = cvGetMat( dst, &dststub, &coi2 ));
   1639 
   1640     if( coi != 0 || coi2 != 0 )
   1641         CV_ERROR( CV_BadCOI, "" );
   1642 
   1643     if( !CV_ARE_DEPTHS_EQ(src, dst) )
   1644         CV_ERROR( CV_StsUnmatchedFormats, "" );
   1645 
   1646     if( src_seq || dst_seq )
   1647     {
   1648         if( !src_seq )
   1649         {
   1650             if( CV_IS_MAT_CONT(src->type) || (src->rows != 1 && src->cols != 1) )
   1651                 CV_ERROR( CV_StsBadSize, "if eigher the source or destination is a sequence, "
   1652                 "the other array must be also a sequence of continous 1d vector" );
   1653             src_seq = cvMakeSeqHeaderForArray( CV_MAT_TYPE(src->type), sizeof(hdr),
   1654                                        CV_ELEM_SIZE(src->type), src->data.ptr,
   1655                                        src->rows + src->cols + 1, &hdr, &block_hdr );
   1656         }
   1657 
   1658         if( !dst_seq )
   1659         {
   1660             if( CV_IS_MAT_CONT(dst->type) || (dst->rows != 1 && dst->cols != 1) )
   1661                 CV_ERROR( CV_StsBadSize, "if eigher the source or destination is a sequence, "
   1662                 "the other array must be also a sequence of continous 1d vector" );
   1663             if( dst->rows + dst->cols - 1 != src_seq->total )
   1664                 CV_ERROR( CV_StsUnmatchedFormats,
   1665                 "source sequence and destination vector have different sizes" );
   1666             dst_seq = cvMakeSeqHeaderForArray( CV_MAT_TYPE(dst->type), sizeof(hdr),
   1667                                            CV_ELEM_SIZE(dst->type), dst->data.ptr,
   1668                                            dst->rows + dst->cols + 1, &hdr, &block_hdr );
   1669         }
   1670         else if( dst_seq->total != src_seq->total )
   1671         {
   1672             if( dst_seq->total > src_seq->total )
   1673                 cvSeqPopMulti( dst_seq, 0, dst_seq->total - src_seq->total );
   1674             else
   1675                 cvSeqPushMulti( dst_seq, 0, src_seq->total - dst_seq->total );
   1676         }
   1677     }
   1678     else if( !CV_ARE_SIZES_EQ( src, dst ))
   1679         CV_ERROR( CV_StsUnmatchedSizes, "" );
   1680 
   1681     type = CV_MAT_TYPE( src->type );
   1682     cn = CV_MAT_CN( type );
   1683     dst_cn = CV_MAT_CN( dst->type );
   1684 
   1685     if( cn > 4 || dst_cn > 4 )
   1686         CV_ERROR( CV_StsOutOfRange, "Both input and output array must have at most 4 channels" );
   1687 
   1688     if( !CV_IS_MAT( rot ))
   1689         CV_CALL( rot = cvGetMat( rot, &rotstub, &coi ));
   1690 
   1691     if( rot->rows != dst_cn )
   1692         CV_ERROR( CV_StsBadSize,
   1693         "The height of transmat matrix must be equal to number of channels" );
   1694 
   1695     if( rot->cols == cn + 1 || rot->cols == cn )
   1696     {
   1697         if( CV_MAT_TYPE( rot->type ) == CV_64FC1 )
   1698         {
   1699             for( i = 0; i < dst_cn; i++ )
   1700             {
   1701                 buffer[i*(cn+1) + cn] = 0;
   1702                 for( j = 0; j < rot->cols; j++ )
   1703                     buffer[i*(cn+1) + j] = ((double*)(rot->data.ptr + rot->step*i))[j];
   1704             }
   1705         }
   1706         else if( CV_MAT_TYPE( rot->type ) == CV_32FC1 )
   1707         {
   1708             for( i = 0; i < dst_cn; i++ )
   1709             {
   1710                 buffer[i*(cn+1) + cn] = 0;
   1711                 for( j = 0; j < rot->cols; j++ )
   1712                     buffer[i*(cn+1) + j] = ((float*)(rot->data.ptr + rot->step*i))[j];
   1713             }
   1714         }
   1715         else
   1716             CV_ERROR( CV_StsUnsupportedFormat, "Rotation matrix must be 32fC1 or 64fC1" );
   1717     }
   1718     else
   1719         CV_ERROR( CV_StsUnmatchedSizes, "If the source array has <cn> channels, "
   1720            "the transformation matrix must have <cn> x <cn>+1 or <cn> x <cn> size" );
   1721 
   1722     if( shift )
   1723     {
   1724         if( !CV_IS_MAT( shift ))
   1725             CV_CALL( shift = cvGetMat( shift, &shiftstub, &coi ));
   1726 
   1727         if( CV_MAT_CN( shift->type ) * shift->cols * shift->rows == dst_cn &&
   1728             (shift->rows == 1 || shift->cols == 1) )
   1729         {
   1730             if( CV_MAT_DEPTH( shift->type ) == CV_64F )
   1731             {
   1732                 int step = shift->step ? shift->step/sizeof(double) : 1;
   1733                 for( i = 0; i < dst_cn; i++ )
   1734                     buffer[i*(cn+1) + cn] += shift->data.db[i*step];
   1735             }
   1736             else if( CV_MAT_DEPTH( shift->type ) == CV_32F )
   1737             {
   1738                 int step = shift->step ? shift->step/sizeof(float) : 1;
   1739                 for( i = 0; i < dst_cn; i++ )
   1740                     buffer[i*(cn+1) + cn] += shift->data.fl[i*step];
   1741             }
   1742             else
   1743                 CV_ERROR( CV_StsUnsupportedFormat, "Shift vector must be 32f or 64f" );
   1744         }
   1745         else
   1746         {
   1747             CV_ERROR( CV_StsUnmatchedSizes,
   1748                 "Shift (if present) must be 1 dimensional vector with the number "
   1749                 "of elements equal to number of channels in the processed array" );
   1750         }
   1751     }
   1752 
   1753     if( coi != 0 || coi2 != 0 )
   1754         CV_ERROR( CV_BadCOI, "" );
   1755 
   1756     {
   1757         CvTransformFunc func = (CvTransformFunc)(transform_tab.fn_2d[type]);
   1758         CvDiagTransformFunc diag_func = 0;
   1759         CvLUT_TransformFunc lut_func = 0;
   1760         int diag_transform = 0;
   1761         CvColorTwistIPPFunc ipp_func = 0;
   1762         CvSize size;
   1763         float* ipp_coeffs = (float*)cvStackAlloc( 16*sizeof(ipp_coeffs[0]) );
   1764 
   1765         if( !func )
   1766             CV_ERROR( CV_StsUnsupportedFormat, "" );
   1767 
   1768         if( cn == dst_cn )
   1769             ipp_func = type == CV_8UC3 ? icvColorTwist_8u_C3R_p :
   1770                        type == CV_16UC3 ? icvColorTwist_16u_C3R_p :
   1771                        type == CV_16SC3 ? icvColorTwist_16s_C3R_p :
   1772                        type == CV_32FC3 ? icvColorTwist_32f_C3R_p :
   1773                        type == CV_32FC4 && fabs(buffer[4]) < DBL_EPSILON &&
   1774                        fabs(buffer[9]) < DBL_EPSILON && fabs(buffer[14]) < DBL_EPSILON &&
   1775                        fabs(buffer[19]) < DBL_EPSILON ? icvColorTwist_32f_C4R_p : 0;
   1776         else if( dst_cn == 1 && (cn == 3 || cn == 4) &&
   1777                  buffer[0] >= 0 && buffer[1] >= 0 && buffer[2] >= 0 &&
   1778                  buffer[0] + buffer[1] + buffer[2] <= 1.01 &&
   1779                  fabs(buffer[3]) < DBL_EPSILON && (cn == 3 || fabs(buffer[4]) < DBL_EPSILON) )
   1780         {
   1781             if( cn == 3 )
   1782                 ipp_func = type == CV_8UC3 ? icvColorToGray_8u_C3C1R_p :
   1783                            type == CV_16UC3 ? icvColorToGray_16u_C3C1R_p :
   1784                            type == CV_16SC3 ? icvColorToGray_16s_C3C1R_p :
   1785                            type == CV_32FC3 ? icvColorToGray_32f_C3C1R_p : 0;
   1786             else
   1787                 ipp_func = type == CV_8UC4 ? icvColorToGray_8u_AC4C1R_p :
   1788                            type == CV_16UC4 ? icvColorToGray_16u_AC4C1R_p :
   1789                            type == CV_16SC4 ? icvColorToGray_16s_AC4C1R_p :
   1790                            type == CV_32FC4 ? icvColorToGray_32f_AC4C1R_p : 0;
   1791         }
   1792 
   1793         if( dst_cn == cn )
   1794         {
   1795             diag_transform = 1;
   1796             for( i = 0; i < dst_cn; i++ )
   1797                 for( j = 0; j < cn; j++ )
   1798                 {
   1799                     if( i != j && fabs(buffer[i*(cn+1) + j]) > DBL_EPSILON )
   1800                     {
   1801                         diag_transform = 0;
   1802                         break;
   1803                     }
   1804                 }
   1805 
   1806             if( diag_transform )
   1807             {
   1808                 if( CV_MAT_DEPTH(type) == CV_8U )
   1809                 {
   1810                     CV_CALL( lut = cvCreateMat( 1, 256, type ));
   1811                     for( i = 0; i < cn; i++ )
   1812                     {
   1813                         double a = buffer[i*(cn+1) + i], b = buffer[i*(cn+1) + cn];
   1814                         uchar* ltab = lut->data.ptr;
   1815                         for( j = 0; j < 256; j++ )
   1816                         {
   1817                             int t = cvRound(a*j + b);
   1818                             ltab[j*cn + i] = CV_CAST_8U(t);
   1819                         }
   1820                     }
   1821                     lut_func = cn == 1 ? (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C1R :
   1822                                cn == 2 ? (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C2R :
   1823                                cn == 3 ? (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C3R :
   1824                                (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C4R;
   1825                 }
   1826                 else
   1827                     diag_func = (CvDiagTransformFunc)(diag_transform_tab.fn_2d[type]);
   1828             }
   1829         }
   1830 
   1831         if( ipp_func )
   1832         {
   1833             const double* ptr = buffer;
   1834 
   1835             // fill cn x 4 ipp_coeffs array
   1836             for( i = 0; i < cn*4; i += 4, ptr += cn+1 )
   1837             {
   1838                 float t0 = (float)ptr[0];
   1839                 float t1 = (float)ptr[1];
   1840                 ipp_coeffs[i] = t0;
   1841                 ipp_coeffs[i+1] = t1;
   1842                 t0 = (float)ptr[2];
   1843                 t1 = (float)ptr[3];
   1844                 ipp_coeffs[i+2] = t0;
   1845                 ipp_coeffs[i+3] = t1;
   1846             }
   1847         }
   1848 
   1849         if( !src_seq )
   1850         {
   1851             int srcstep = src->step;
   1852             int dststep = dst->step;
   1853             size = cvGetMatSize( src );
   1854 
   1855             if( CV_IS_MAT_CONT( src->type & dst->type ))
   1856             {
   1857                 size.width *= size.height;
   1858                 size.height = 1;
   1859                 srcstep = dststep = CV_STUB_STEP;
   1860             }
   1861 
   1862             if( lut_func )
   1863                 lut_func( src->data.ptr, src->step, dst->data.ptr,
   1864                           dst->step, size, lut->data.ptr );
   1865             else if( ipp_func )
   1866             {
   1867                 IPPI_CALL( ipp_func( src->data.ptr, srcstep, dst->data.ptr,
   1868                                      dststep, size, ipp_coeffs ));
   1869             }
   1870             else if( diag_transform )
   1871                 diag_func( src->data.ptr, src->step, dst->data.ptr,
   1872                            dst->step, size, buffer );
   1873             else
   1874                 func( src->data.ptr, src->step, dst->data.ptr,
   1875                       dst->step, size, buffer, dst_cn );
   1876         }
   1877         else
   1878         {
   1879             CvSeqBlock* src_block = src_seq->first;
   1880             CvSeqBlock* dst_block = dst_seq->first;
   1881             int src_idx = 0, dst_idx = 0;
   1882             int src_elem_size = CV_ELEM_SIZE(src_seq->flags);
   1883             int dst_elem_size = CV_ELEM_SIZE(dst_seq->flags);
   1884 
   1885             for( i = src_seq->total; i > 0; )
   1886             {
   1887                 int src_len = src_block->count - src_idx;
   1888                 int dst_len = dst_block->count - dst_idx;
   1889                 const void* srcptr = src_block->data + src_idx*src_elem_size;
   1890                 void* dstptr = dst_block->data + dst_idx*dst_elem_size;
   1891                 src_len = MIN(src_len, dst_len);
   1892 
   1893                 if( lut_func )
   1894                     lut_func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
   1895                               cvSize( src_len, 1 ), lut->data.ptr );
   1896                 else if( ipp_func )
   1897                 {
   1898                     IPPI_CALL( ipp_func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
   1899                                          cvSize( src_len, 1 ), ipp_coeffs ));
   1900                 }
   1901                 else if( diag_transform )
   1902                     diag_func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
   1903                                cvSize( src_len, 1 ), buffer );
   1904                 else
   1905                     func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
   1906                           cvSize( src_len, 1 ), buffer, dst_cn );
   1907 
   1908                 if( (src_idx += src_len) == src_block->count )
   1909                     src_block = src_block->next, src_idx = 0;
   1910                 if( (dst_idx += src_len) == dst_block->count )
   1911                     dst_block = dst_block->next, dst_idx = 0;
   1912                 i -= src_len;
   1913             }
   1914         }
   1915     }
   1916 
   1917     __END__;
   1918 
   1919     cvReleaseMat( &lut );
   1920 }
   1921 
   1922 
   1923 /****************************************************************************************\
   1924 *                                        cvPerspectiveTransform                          *
   1925 \****************************************************************************************/
   1926 
   1927 #define ICV_PERSPECTIVE_TRANSFORM_FUNC_2( flavor, arrtype )                             \
   1928 static CvStatus CV_STDCALL                                                              \
   1929 icvPerspectiveTransform_##flavor##_C2R( const arrtype* src, int srcstep,                \
   1930                                         arrtype* dst, int dststep,                      \
   1931                                         CvSize size, const double* mat )                \
   1932 {                                                                                       \
   1933     int i;                                                                              \
   1934     size.width *= 2;                                                                    \
   1935     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                               \
   1936                                                                                         \
   1937     for( ; size.height--; src += srcstep, dst += dststep )                              \
   1938     {                                                                                   \
   1939         for( i = 0; i < size.width; i += 2 )                                            \
   1940         {                                                                               \
   1941             arrtype x = src[i], y = src[i + 1];                                         \
   1942             double w = x*mat[6] + y*mat[7] + mat[8];                                    \
   1943                                                                                         \
   1944             if( fabs(w) > FLT_EPSILON )                                                 \
   1945             {                                                                           \
   1946                 w = 1./w;                                                               \
   1947                 dst[i] = (arrtype)((x*mat[0] + y*mat[1] + mat[2]) * w);                 \
   1948                 dst[i+1] = (arrtype)((x*mat[3] + y*mat[4] + mat[5]) * w);               \
   1949             }                                                                           \
   1950             else                                                                        \
   1951             {                                                                           \
   1952                 dst[i] = (arrtype)0;                                                    \
   1953                 dst[i+1] = (arrtype)0;                                                  \
   1954             }                                                                           \
   1955         }                                                                               \
   1956     }                                                                                   \
   1957                                                                                         \
   1958     return CV_OK;                                                                       \
   1959 }
   1960 
   1961 
   1962 #define ICV_PERSPECTIVE_TRANSFORM_FUNC_3( flavor, arrtype )                             \
   1963 static CvStatus CV_STDCALL                                                              \
   1964 icvPerspectiveTransform_##flavor##_C3R( const arrtype* src, int srcstep,                \
   1965                                              arrtype* dst, int dststep,                 \
   1966                                              CvSize size, const double* mat )           \
   1967 {                                                                                       \
   1968     int i;                                                                              \
   1969     size.width *= 3;                                                                    \
   1970     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                               \
   1971                                                                                         \
   1972     for( ; size.height--; src += srcstep, dst += dststep )                              \
   1973     {                                                                                   \
   1974         for( i = 0; i < size.width; i += 3 )                                            \
   1975         {                                                                               \
   1976             arrtype x = src[i], y = src[i + 1], z = src[i + 2];                         \
   1977             double w = x*mat[12] + y*mat[13] + z*mat[14] + mat[15];                     \
   1978                                                                                         \
   1979             if( fabs(w) > FLT_EPSILON )                                                 \
   1980             {                                                                           \
   1981                 w = 1./w;                                                               \
   1982                 dst[i] = (arrtype)((x*mat[0] + y*mat[1] + z*mat[2] + mat[3]) * w);      \
   1983                 dst[i+1] = (arrtype)((x*mat[4] + y*mat[5] + z*mat[6] + mat[7]) * w);    \
   1984                 dst[i+2] = (arrtype)((x*mat[8] + y*mat[9] + z*mat[10] + mat[11]) * w);  \
   1985             }                                                                           \
   1986             else                                                                        \
   1987             {                                                                           \
   1988                 dst[i] = (arrtype)0;                                                    \
   1989                 dst[i+1] = (arrtype)0;                                                  \
   1990                 dst[i+2] = (arrtype)0;                                                  \
   1991             }                                                                           \
   1992         }                                                                               \
   1993     }                                                                                   \
   1994                                                                                         \
   1995     return CV_OK;                                                                       \
   1996 }
   1997 
   1998 ICV_PERSPECTIVE_TRANSFORM_FUNC_2( 32f, float )
   1999 ICV_PERSPECTIVE_TRANSFORM_FUNC_2( 64f, double )
   2000 ICV_PERSPECTIVE_TRANSFORM_FUNC_3( 32f, float )
   2001 ICV_PERSPECTIVE_TRANSFORM_FUNC_3( 64f, double )
   2002 
   2003 static void icvInitPerspectiveTransformTable( CvFuncTable* tab2, CvFuncTable* tab3 )\
   2004 {                                                                                   \
   2005     tab2->fn_2d[CV_32F] = (void*)icvPerspectiveTransform_32f_C2R;                   \
   2006     tab2->fn_2d[CV_64F] = (void*)icvPerspectiveTransform_64f_C2R;                   \
   2007     tab3->fn_2d[CV_32F] = (void*)icvPerspectiveTransform_32f_C3R;                   \
   2008     tab3->fn_2d[CV_64F] = (void*)icvPerspectiveTransform_64f_C3R;                   \
   2009 }
   2010 
   2011 
   2012 CV_IMPL void
   2013 cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
   2014 {
   2015     static CvFuncTable tab[2];
   2016     static int inittab = 0;
   2017     double buffer[16];
   2018 
   2019     CV_FUNCNAME( "cvPerspectiveProject" );
   2020 
   2021     __BEGIN__;
   2022 
   2023     CvMat sstub, *src = (CvMat*)srcarr;
   2024     CvMat dstub, *dst = (CvMat*)dstarr;
   2025     int i, j, type, cn;
   2026     CvFunc2D_2A1P func = 0;
   2027     CvSize size;
   2028 
   2029     if( !inittab )
   2030     {
   2031         icvInitPerspectiveTransformTable( &tab[0], &tab[1] );
   2032         inittab = 1;
   2033     }
   2034 
   2035     if( !CV_IS_MAT( src ))
   2036     {
   2037         int coi = 0;
   2038         CV_CALL( src = cvGetMat( src, &sstub, &coi ));
   2039 
   2040         if( coi != 0 )
   2041             CV_ERROR( CV_BadCOI, "" );
   2042     }
   2043 
   2044     if( !CV_IS_MAT( dst ))
   2045     {
   2046         int coi = 0;
   2047         CV_CALL( dst = cvGetMat( dst, &dstub, &coi ));
   2048 
   2049         if( coi != 0 )
   2050             CV_ERROR( CV_BadCOI, "" );
   2051     }
   2052 
   2053     if( !CV_ARE_TYPES_EQ( src, dst ))
   2054         CV_ERROR( CV_StsUnmatchedFormats, "" );
   2055 
   2056     if( !CV_ARE_SIZES_EQ( src, dst ))
   2057         CV_ERROR( CV_StsUnmatchedSizes, "" );
   2058 
   2059     type = CV_MAT_TYPE( src->type );
   2060     cn = CV_MAT_CN( type );
   2061 
   2062     if( cn != 2 && cn != 3 )
   2063         CV_ERROR( CV_BadNumChannels, cvUnsupportedFormat );
   2064 
   2065     if( !CV_IS_MAT( mat ))
   2066         CV_ERROR( CV_StsBadArg, "Invalid transformation matrix" );
   2067 
   2068     if( mat->rows != cn + 1 && mat->cols != mat->rows )
   2069         CV_ERROR( CV_StsBadSize,
   2070         "The size of transform matrix must be equal to number of channels" );
   2071 
   2072     if( CV_MAT_TYPE( mat->type ) == CV_64FC1 )
   2073     {
   2074         for( i = 0; i <= cn; i++ )
   2075         {
   2076             for( j = 0; j <= cn; j++ )
   2077                 buffer[i*(cn+1) + j] = ((double*)(mat->data.ptr + mat->step*i))[j];
   2078         }
   2079     }
   2080     else if( CV_MAT_TYPE( mat->type ) == CV_32FC1 )
   2081     {
   2082         for( i = 0; i <= cn; i++ )
   2083         {
   2084             for( j = 0; j <= cn; j++ )
   2085                 buffer[i*(cn+1) + j] = ((float*)(mat->data.ptr + mat->step*i))[j];
   2086         }
   2087     }
   2088     else
   2089     {
   2090         CV_ERROR( CV_StsUnsupportedFormat, "Rotation matrix must be 32fC1 or 64fC1" );
   2091     }
   2092 
   2093     func = (CvFunc2D_2A1P)tab[cn == 3].fn_2d[CV_MAT_DEPTH(type)];
   2094 
   2095     if( !func )
   2096         CV_ERROR( CV_StsUnsupportedFormat, "" );
   2097 
   2098     size = cvGetMatSize( src );
   2099 
   2100     if( CV_IS_MAT_CONT( src->type & dst->type ))
   2101     {
   2102         size.width *= size.height;
   2103         size.height = 1;
   2104     }
   2105 
   2106     IPPI_CALL( func( src->data.ptr, src->step, dst->data.ptr, dst->step, size, buffer));
   2107 
   2108     CV_CHECK_NANS( dst );
   2109 
   2110     __END__;
   2111 }
   2112 
   2113 
   2114 /****************************************************************************************\
   2115 *                                       cvScaleAdd                                       *
   2116 \****************************************************************************************/
   2117 
   2118 #define  ICV_DEF_MULADDC_CASE_C1( arrtype, temptype, src1, src2, dst, len )     \
   2119 {                                                                               \
   2120     int i;                                                                      \
   2121                                                                                 \
   2122     for( i = 0; i <= (len) - 4; i += 4 )                                        \
   2123     {                                                                           \
   2124         temptype t0 = (src1)[i]*s0 + (src2)[i];                                 \
   2125         temptype t1 = (src1)[i+1]*s0 + (src2)[i+1];                             \
   2126                                                                                 \
   2127         (dst)[i] = (arrtype)t0;                                                 \
   2128         (dst)[i+1] = (arrtype)t1;                                               \
   2129                                                                                 \
   2130         t0 = (src1)[i+2]*s0 + (src2)[i+2];                                      \
   2131         t1 = (src1)[i+3]*s0 + (src2)[i+3];                                      \
   2132                                                                                 \
   2133         (dst)[i+2] = (arrtype)t0;                                               \
   2134         (dst)[i+3] = (arrtype)t1;                                               \
   2135     }                                                                           \
   2136                                                                                 \
   2137     for( ; i < (len); i++ )                                                     \
   2138     {                                                                           \
   2139         temptype t0 = (src1)[i]*s0 + (src2)[i];                                 \
   2140         (dst)[i] = (arrtype)t0;                                                 \
   2141     }                                                                           \
   2142 }
   2143 
   2144 
   2145 #define  ICV_DEF_MULADDC_CASE_C2( arrtype, temptype, src1, src2, dst, len )     \
   2146 {                                                                               \
   2147     int i;                                                                      \
   2148                                                                                 \
   2149     for( i = 0; i <= (len) - 4; i += 4 )                                        \
   2150     {                                                                           \
   2151         temptype t0 = (src1)[i]*s0 - (src1)[i+1]*s1 + (src2)[i];                \
   2152         temptype t1 = (src1)[i]*s1 + (src1)[i+1]*s0 + (src2)[i+1];              \
   2153                                                                                 \
   2154         (dst)[i] = (arrtype)t0;                                                 \
   2155         (dst)[i+1] = (arrtype)t1;                                               \
   2156                                                                                 \
   2157         t0 = (src1)[i+2]*s0 - (src1)[i+3]*s1 + (src2)[i+2];                     \
   2158         t1 = (src1)[i+2]*s1 + (src1)[i+3]*s0 + (src2)[i+3];                     \
   2159                                                                                 \
   2160         (dst)[i+2] = (arrtype)t0;                                               \
   2161         (dst)[i+3] = (arrtype)t1;                                               \
   2162     }                                                                           \
   2163                                                                                 \
   2164     for( ; i < (len); i += 2 )                                                  \
   2165     {                                                                           \
   2166         temptype t0 = (src1)[i]*s0 - (src1)[i+1]*s1 + (src2)[i];                \
   2167         temptype t1 = (src1)[i]*s1 + (src1)[i+1]*s0 + (src2)[i+1];              \
   2168                                                                                 \
   2169         (dst)[i] = (arrtype)t0;                                                 \
   2170         (dst)[i+1] = (arrtype)t1;                                               \
   2171     }                                                                           \
   2172 }
   2173 
   2174 
   2175 #define  ICV_DEF_MULADDS_FUNC( flavor, arrtype, scalartype, entry, cn )     \
   2176 static CvStatus CV_STDCALL                                                  \
   2177 icvMulAddC_##flavor( const arrtype* src1, int srcstep1,                     \
   2178                       const arrtype* src2, int srcstep2,                    \
   2179                       arrtype* dst, int dststep, CvSize size,               \
   2180                       const scalartype* scalar )                            \
   2181 {                                                                           \
   2182     entry(scalartype);                                                      \
   2183     size.width *= (cn);                                                     \
   2184     srcstep1 /= sizeof(src1[0]); srcstep2 /= sizeof(src2[0]);               \
   2185     dststep /= sizeof(dst[0]);                                              \
   2186                                                                             \
   2187     for( ; size.height--; src1+=srcstep1, src2+=srcstep2, dst+=dststep )    \
   2188     {                                                                       \
   2189         ICV_DEF_MULADDC_CASE_C##cn( arrtype, scalartype, src1, src2,        \
   2190                                     dst, size.width )                       \
   2191     }                                                                       \
   2192                                                                             \
   2193     return CV_OK;                                                           \
   2194 }
   2195 
   2196 
   2197 ICV_DEF_MULADDS_FUNC( 32f_C1R, float, double, CV_UN_ENTRY_C1, 1 )
   2198 ICV_DEF_MULADDS_FUNC( 32f_C2R, float, double, CV_UN_ENTRY_C2, 2 )
   2199 ICV_DEF_MULADDS_FUNC( 64f_C1R, double, double, CV_UN_ENTRY_C1, 1 )
   2200 ICV_DEF_MULADDS_FUNC( 64f_C2R, double, double, CV_UN_ENTRY_C2, 2 )
   2201 
   2202 
   2203 static void
   2204 icvInitMulAddCTable( CvBigFuncTable* tab )
   2205 {
   2206     tab->fn_2d[CV_32FC1] = (void*)icvMulAddC_32f_C1R;
   2207     tab->fn_2d[CV_32FC2] = (void*)icvMulAddC_32f_C2R;
   2208     tab->fn_2d[CV_64FC1] = (void*)icvMulAddC_64f_C1R;
   2209     tab->fn_2d[CV_64FC2] = (void*)icvMulAddC_64f_C2R;
   2210 }
   2211 
   2212 
   2213 CV_IMPL void
   2214 cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
   2215             const CvArr* srcarr2, CvArr* dstarr )
   2216 {
   2217     static CvBigFuncTable muladds_tab;
   2218     static int inittab = 0;
   2219 
   2220     CV_FUNCNAME( "cvScaleAdd" );
   2221 
   2222     __BEGIN__;
   2223 
   2224     CvMat stub1, *src1 = (CvMat*)srcarr1;
   2225     CvMat stub2, *src2 = (CvMat*)srcarr2;
   2226     CvMat stub, *dst = (CvMat*)dstarr;
   2227     CvSize size;
   2228     int type;
   2229 
   2230     if( !CV_IS_MAT( src1 ) || !CV_IS_MAT(src2) || !CV_IS_MAT(dst))
   2231     {
   2232         int coi1 = 0, coi2 = 0, coi3 = 0;
   2233         CV_CALL( src1 = cvGetMat( src1, &stub1, &coi1 ));
   2234         CV_CALL( src2 = cvGetMat( src2, &stub2, &coi2 ));
   2235         CV_CALL( dst = cvGetMat( dst, &stub, &coi3 ));
   2236 
   2237         if( coi1 + coi2 + coi3 != 0 )
   2238             CV_ERROR( CV_BadCOI, "" );
   2239     }
   2240 
   2241     if( !CV_ARE_TYPES_EQ( src1, dst ) || !CV_ARE_TYPES_EQ( src2, dst ))
   2242         CV_ERROR( CV_StsUnmatchedFormats, "" );
   2243 
   2244     if( !CV_ARE_SIZES_EQ( src1, dst ) || !CV_ARE_SIZES_EQ( src2, dst ))
   2245         CV_ERROR( CV_StsUnmatchedSizes, "" );
   2246 
   2247     type = CV_MAT_TYPE( src1->type );
   2248     size = cvGetMatSize( src1 );
   2249 
   2250     if( CV_IS_MAT_CONT( src1->type & src2->type & dst->type ))
   2251     {
   2252         size.width *= size.height;
   2253 
   2254         if( size.width <= CV_MAX_INLINE_MAT_OP_SIZE )
   2255         {
   2256             if( type == CV_32FC1 )
   2257             {
   2258                 float* mA = src1->data.fl;
   2259                 float* mB = src2->data.fl;
   2260                 float* mC = dst->data.fl;
   2261 
   2262                 do
   2263                 {
   2264                     mC[size.width - 1] = (float)(mA[size.width - 1]*scale.val[0] +
   2265                                          mB[size.width - 1]);
   2266                 }
   2267                 while( --size.width );
   2268 
   2269                 EXIT;
   2270             }
   2271 
   2272             if( type == CV_64FC1 )
   2273             {
   2274                 double* mA = src1->data.db;
   2275                 double* mB = src2->data.db;
   2276                 double* mC = dst->data.db;
   2277 
   2278                 do
   2279                 {
   2280                     mC[size.width - 1] = mA[size.width - 1]*scale.val[0] +
   2281                                          mB[size.width - 1];
   2282                 }
   2283                 while( --size.width );
   2284 
   2285                 EXIT;
   2286             }
   2287         }
   2288 
   2289         size.height = 1;
   2290     }
   2291 
   2292     if( !inittab )
   2293     {
   2294         icvInitMulAddCTable( &muladds_tab );
   2295         inittab = 1;
   2296     }
   2297 
   2298     if( CV_MAT_CN(type) > 2 )
   2299         CV_ERROR( CV_StsOutOfRange, "The function only supports 1- and 2-channel arrays" );
   2300 
   2301     {
   2302         CvFunc2D_3A1P func = (CvFunc2D_3A1P)(muladds_tab.fn_2d[type]);
   2303 
   2304         if( !func )
   2305             CV_ERROR( CV_StsUnsupportedFormat, "" );
   2306 
   2307         IPPI_CALL( func( src1->data.ptr, src1->step, src2->data.ptr, src2->step,
   2308                          dst->data.ptr, dst->step, size, scale.val ));
   2309     }
   2310 
   2311     CV_CHECK_NANS( dst );
   2312 
   2313     __END__;
   2314 }
   2315 
   2316 
   2317 /****************************************************************************************\
   2318 *                                    cvCalcCovarMatrix                                   *
   2319 \****************************************************************************************/
   2320 
   2321 #define ICV_DOT_PRODUCT_CASE( flavor, srctype, avgtype, load_macro )                    \
   2322 static CvStatus CV_STDCALL                                                              \
   2323 icvDotProductShifted_##flavor##_C1R( const srctype* vec1, int vecstep1,                 \
   2324                                      const srctype* vec2, int vecstep2,                 \
   2325                                      const avgtype* avg, int avgstep,                   \
   2326                                      CvSize size, double* _result )                     \
   2327 {                                                                                       \
   2328     double result = 0;                                                                  \
   2329     vecstep1 /= sizeof(vec1[0]); vecstep2 /= sizeof(vec2[0]); avgstep /= sizeof(avg[0]);\
   2330                                                                                         \
   2331     for( ; size.height--; vec1 += vecstep1, vec2 += vecstep2, avg += avgstep )          \
   2332     {                                                                                   \
   2333         int x;                                                                          \
   2334         for( x = 0; x <= size.width - 4; x += 4 )                                       \
   2335             result += (load_macro(vec1[x]) - avg[x])*(load_macro(vec2[x]) - avg[x]) +   \
   2336                 (load_macro(vec1[x+1]) - avg[x+1])*(load_macro(vec2[x+1]) - avg[x+1]) + \
   2337                 (load_macro(vec1[x+2]) - avg[x+2])*(load_macro(vec2[x+2]) - avg[x+2]) + \
   2338                 (load_macro(vec1[x+3]) - avg[x+3])*(load_macro(vec2[x+3]) - avg[x+3]);  \
   2339         for( ; x < size.width; x++ )                                                    \
   2340             result += (load_macro(vec1[x]) - avg[x])*(load_macro(vec2[x]) - avg[x]);    \
   2341     }                                                                                   \
   2342                                                                                         \
   2343     *_result = result;                                                                  \
   2344     return CV_OK;                                                                       \
   2345 }
   2346 
   2347 
   2348 ICV_DOT_PRODUCT_CASE( 8u32f, uchar, float, CV_8TO32F )
   2349 ICV_DOT_PRODUCT_CASE( 8u64f, uchar, double, CV_8TO32F )
   2350 ICV_DOT_PRODUCT_CASE( 16u32f, ushort, float, CV_NOP )
   2351 ICV_DOT_PRODUCT_CASE( 16u64f, ushort, double, CV_NOP )
   2352 ICV_DOT_PRODUCT_CASE( 16s32f, short, float, CV_NOP )
   2353 ICV_DOT_PRODUCT_CASE( 16s64f, short, double, CV_NOP )
   2354 ICV_DOT_PRODUCT_CASE( 32f, float, float, CV_NOP )
   2355 ICV_DOT_PRODUCT_CASE( 32f64f, float, double, CV_NOP )
   2356 ICV_DOT_PRODUCT_CASE( 64f, double, double, CV_NOP )
   2357 
   2358 static void  icvInitDotProductShiftedTable( CvFuncTable* tabfl, CvFuncTable* tabdb )
   2359 {
   2360     tabfl->fn_2d[CV_8U] = (void*)icvDotProductShifted_8u32f_C1R;
   2361     tabfl->fn_2d[CV_8S] = 0;
   2362     tabfl->fn_2d[CV_16U] = (void*)icvDotProductShifted_16u32f_C1R;
   2363     tabfl->fn_2d[CV_16S] = (void*)icvDotProductShifted_16s32f_C1R;
   2364     tabfl->fn_2d[CV_32S] = 0;
   2365     tabfl->fn_2d[CV_32F] = (void*)icvDotProductShifted_32f_C1R;
   2366     tabfl->fn_2d[CV_64F] = 0;
   2367 
   2368     tabdb->fn_2d[CV_8U] = (void*)icvDotProductShifted_8u64f_C1R;
   2369     tabdb->fn_2d[CV_8S] = 0;
   2370     tabdb->fn_2d[CV_16U] = (void*)icvDotProductShifted_16u64f_C1R;
   2371     tabdb->fn_2d[CV_16S] = (void*)icvDotProductShifted_16s64f_C1R;
   2372     tabdb->fn_2d[CV_32S] = 0;
   2373     tabdb->fn_2d[CV_32F] = (void*)icvDotProductShifted_32f64f_C1R;
   2374     tabdb->fn_2d[CV_64F] = (void*)icvDotProductShifted_64f_C1R;
   2375 }
   2376 
   2377 #define ICV_EXT_PRODUCT_CASE( flavor, srctype, avgtype, load_macro )                    \
   2378 static CvStatus CV_STDCALL                                                              \
   2379 icvExtProductShifted_##flavor##_C1R( const srctype* vec, int vecstep,                   \
   2380                                      const avgtype* avg, int avgstep,                   \
   2381                                      avgtype* dst, int dststep,                         \
   2382                                      CvSize size, avgtype* tempbuf )                    \
   2383 {                                                                                       \
   2384     int x, y, dstsize = size.width * size.height;                                       \
   2385                                                                                         \
   2386     vecstep /= sizeof(vec[0]); avgstep /= sizeof(avg[0]);                               \
   2387     for( y = 0; y < size.height; y++, vec += vecstep, avg += avgstep )                  \
   2388         for( x = 0; x < size.width; x++ )                                               \
   2389             *tempbuf++ = load_macro(vec[x]) - avg[x];                                   \
   2390     tempbuf -= dstsize;                                                                 \
   2391                                                                                         \
   2392     dststep /= sizeof(dst[0]);                                                          \
   2393     for( y = 0; y < dstsize; y++, dst += dststep )                                      \
   2394     {                                                                                   \
   2395         double ty = tempbuf[y];                                                         \
   2396         for( x = 0; x <= y - 3; x += 4 )                                                \
   2397         {                                                                               \
   2398             double t0 = dst[x] + ty*tempbuf[x];                                         \
   2399             double t1 = dst[x+1] + ty*tempbuf[x+1];                                     \
   2400             dst[x] = (avgtype)t0;                                                       \
   2401             dst[x+1] = (avgtype)t1;                                                     \
   2402             t0 = dst[x+2] + ty*tempbuf[x+2];                                            \
   2403             t1 = dst[x+3] + ty*tempbuf[x+3];                                            \
   2404             dst[x+2] = (avgtype)t0;                                                     \
   2405             dst[x+3] = (avgtype)t1;                                                     \
   2406         }                                                                               \
   2407         for( ; x <= y; x++ )                                                            \
   2408             dst[x] = (avgtype)(dst[x] + ty*tempbuf[x]);                                 \
   2409     }                                                                                   \
   2410                                                                                         \
   2411     return CV_OK;                                                                       \
   2412 }
   2413 
   2414 ICV_EXT_PRODUCT_CASE( 8u32f, uchar, float, CV_8TO32F )
   2415 ICV_EXT_PRODUCT_CASE( 8u64f, uchar, double, CV_8TO32F )
   2416 ICV_EXT_PRODUCT_CASE( 16u32f, ushort, float, CV_NOP )
   2417 ICV_EXT_PRODUCT_CASE( 16u64f, ushort, double, CV_NOP )
   2418 ICV_EXT_PRODUCT_CASE( 16s32f, short, float, CV_NOP )
   2419 ICV_EXT_PRODUCT_CASE( 16s64f, short, double, CV_NOP )
   2420 ICV_EXT_PRODUCT_CASE( 32f, float, float, CV_NOP )
   2421 ICV_EXT_PRODUCT_CASE( 32f64f, float, double, CV_NOP )
   2422 ICV_EXT_PRODUCT_CASE( 64f, double, double, CV_NOP )
   2423 
   2424 
   2425 static void  icvInitExtProductShiftedTable( CvFuncTable* tabfl, CvFuncTable* tabdb )
   2426 {
   2427     tabfl->fn_2d[CV_8U] = (void*)icvExtProductShifted_8u32f_C1R;
   2428     tabfl->fn_2d[CV_8S] = 0;
   2429     tabfl->fn_2d[CV_16U] = (void*)icvExtProductShifted_16u32f_C1R;
   2430     tabfl->fn_2d[CV_16S] = (void*)icvExtProductShifted_16s32f_C1R;
   2431     tabfl->fn_2d[CV_32S] = 0;
   2432     tabfl->fn_2d[CV_32F] = (void*)icvExtProductShifted_32f_C1R;
   2433     tabfl->fn_2d[CV_64F] = 0;
   2434 
   2435     tabdb->fn_2d[CV_8U] = (void*)icvExtProductShifted_8u64f_C1R;
   2436     tabdb->fn_2d[CV_8S] = 0;
   2437     tabdb->fn_2d[CV_16U] = (void*)icvExtProductShifted_16u64f_C1R;
   2438     tabdb->fn_2d[CV_16S] = (void*)icvExtProductShifted_16s64f_C1R;
   2439     tabdb->fn_2d[CV_32S] = 0;
   2440     tabdb->fn_2d[CV_32F] = (void*)icvExtProductShifted_32f64f_C1R;
   2441     tabdb->fn_2d[CV_64F] = (void*)icvExtProductShifted_64f_C1R;
   2442 }
   2443 
   2444 
   2445 typedef struct vec_data
   2446 {
   2447     void* ptr;
   2448     int step;
   2449 }
   2450 vec_data;
   2451 
   2452 CV_IMPL void
   2453 cvCalcCovarMatrix( const CvArr** vecarr, int count,
   2454                    CvArr* covarr, CvArr* avgarr, int flags )
   2455 {
   2456     static CvFuncTable dot_tab[2];
   2457     static CvFuncTable ext_tab[2];
   2458     static int inittab = 0;
   2459     vec_data* vecdata = 0;
   2460     CvMat *tempvec = 0;
   2461 
   2462     CV_FUNCNAME( "cvCalcCovarMatrix" );
   2463 
   2464     __BEGIN__;
   2465 
   2466     CvMat covstub, *cov = (CvMat*)covarr;
   2467     CvMat avgstub, *avg = (CvMat*)avgarr;
   2468     CvMat vecstub0, *vecmat = 0;
   2469     CvSize srcsize, contsize;
   2470     int srctype = 0, dsttype = 0;
   2471     int i, j;
   2472     int cont_flag, vec_delta = 0, vec_step = 0;
   2473     int is_covar_normal = (flags & CV_COVAR_NORMAL) != 0;
   2474     double scale;
   2475 
   2476     if( !inittab )
   2477     {
   2478         icvInitDotProductShiftedTable( dot_tab + 0, dot_tab + 1 );
   2479         icvInitExtProductShiftedTable( ext_tab + 0, ext_tab + 1 );
   2480         inittab = 1;
   2481     }
   2482 
   2483     if( !vecarr )
   2484         CV_ERROR( CV_StsNullPtr, "NULL vec pointer" );
   2485 
   2486     CV_CALL( cov = cvGetMat( cov, &covstub ));
   2487     CV_CALL( avg = cvGetMat( avg, &avgstub ));
   2488 
   2489     if( !CV_ARE_TYPES_EQ( cov, avg ))
   2490         CV_ERROR( CV_StsUnmatchedFormats,
   2491         "Covariation matrix and average vector should have the same types" );
   2492 
   2493     dsttype = CV_MAT_TYPE( cov->type );
   2494     if( dsttype != CV_32FC1 && dsttype != CV_64FC1 )
   2495         CV_ERROR( CV_StsUnsupportedFormat, "Covariation matrix must be 32fC1 or 64fC1" );
   2496 
   2497     if( cov->rows != cov->cols )
   2498         CV_ERROR( CV_StsBadSize, "Covariation matrix must be square" );
   2499 
   2500     srcsize = cvGetMatSize( avg );
   2501     contsize.width = srcsize.width * srcsize.height;
   2502     contsize.height = 1;
   2503     cont_flag = avg->type;
   2504 
   2505     if( flags & (CV_COVAR_ROWS|CV_COVAR_COLS) )
   2506     {
   2507         CV_CALL( vecmat = cvGetMat( vecarr[0], &vecstub0 ));
   2508         srctype = CV_MAT_TYPE(vecmat->type);
   2509         if( flags & CV_COVAR_COLS )
   2510         {
   2511             count = vecmat->cols;
   2512             if( avg->cols != 1 || avg->rows != vecmat->rows )
   2513                 CV_ERROR( CV_StsUnmatchedSizes,
   2514                 "The number of input vectors does not match to avg vector size" );
   2515             cont_flag = 0;
   2516             vec_delta = CV_ELEM_SIZE(vecmat->type);
   2517             vec_step = vecmat->step;
   2518         }
   2519         else
   2520         {
   2521             count = vecmat->rows;
   2522             if( avg->rows != 1 || avg->cols != vecmat->cols )
   2523                 CV_ERROR( CV_StsUnmatchedSizes,
   2524                 "The number of input vectors does not match to avg vector size" );
   2525             vec_delta = vecmat->step;
   2526             vec_step = CV_STUB_STEP;
   2527         }
   2528 
   2529         if( !(flags & CV_COVAR_USE_AVG) )
   2530             CV_CALL( cvReduce( vecmat, avg, -1, CV_REDUCE_AVG ));
   2531 
   2532         scale = !(flags & CV_COVAR_SCALE) ? 1. : 1./count;
   2533 
   2534         cvMulTransposed( vecmat, cov, ((flags & CV_COVAR_ROWS)!=0) ^ ((flags & CV_COVAR_NORMAL)==0), avg, scale );
   2535         EXIT;
   2536     }
   2537 
   2538     scale = !(flags & CV_COVAR_SCALE) ? 1. : 1./count;
   2539 
   2540     if( is_covar_normal )
   2541     {
   2542         if( count <= 0 )
   2543             CV_ERROR( CV_StsBadSize,
   2544             "The number of vectors is zero or negative" );
   2545         if( cov->rows != contsize.width )
   2546             CV_ERROR( CV_StsUnmatchedSizes,
   2547             "The size of input vectors does not match with the size of covariation matrix" );
   2548 
   2549         CV_CALL( tempvec = cvCreateMat( avg->rows, avg->cols, dsttype ));
   2550     }
   2551     else if( count != cov->rows )
   2552         CV_ERROR( CV_StsUnmatchedSizes,
   2553         "The vector count and covariance matrix size do not match" );
   2554 
   2555     if( !(flags & (CV_COVAR_ROWS|CV_COVAR_COLS)) )
   2556     {
   2557         if( !(flags & CV_COVAR_USE_AVG) )
   2558             cvZero( avg );
   2559 
   2560         CV_CALL( vecdata = (vec_data*)cvAlloc( count*sizeof(vecdata[0])));
   2561 
   2562         for( i = 0; i < count; i++ )
   2563         {
   2564             CvMat vecstub, *vec = (CvMat*)vecarr[i];
   2565             CvMat* temp;
   2566 
   2567             if( !CV_IS_MAT(vec) )
   2568                 CV_CALL( vec = cvGetMat( vec, &vecstub ));
   2569 
   2570             if( !CV_ARE_SIZES_EQ( vec, avg ))
   2571                 CV_ERROR( CV_StsUnmatchedSizes,
   2572                 "All input vectors and average vector must have the same size" );
   2573 
   2574             vecdata[i].ptr = vec->data.ptr;
   2575             vecdata[i].step = vec->step;
   2576             cont_flag &= vec->type;
   2577             temp = vec;
   2578             if( i == 0 )
   2579             {
   2580                 srctype = CV_MAT_TYPE( vec->type );
   2581                 if( CV_MAT_CN( srctype ) != 1 )
   2582                     CV_ERROR( CV_BadNumChannels, "All vectors must have a single channel" );
   2583                 if( srctype != dsttype && !tempvec && !(flags & CV_COVAR_USE_AVG))
   2584                     CV_CALL( tempvec = cvCreateMat( vec->rows, vec->cols, dsttype ));
   2585             }
   2586             else if( CV_MAT_TYPE(vec->type) != srctype )
   2587                 CV_ERROR( CV_StsUnmatchedFormats,
   2588                 "All input vectors must have the same type" );
   2589 
   2590             if( !(flags & CV_COVAR_USE_AVG) )
   2591             {
   2592                 if( tempvec )
   2593                 {
   2594                     temp = tempvec;
   2595                     cvConvert( vec, temp );
   2596                 }
   2597                 cvAdd( temp, avg, avg );
   2598             }
   2599         }
   2600 
   2601         if( !(flags & CV_COVAR_USE_AVG) )
   2602             cvScale( avg, avg, 1./count );
   2603     }
   2604 
   2605     cont_flag = CV_IS_MAT_CONT( cont_flag );
   2606     if( cont_flag )
   2607         srcsize = contsize;
   2608 
   2609     if( !is_covar_normal )
   2610     {
   2611         CvFunc2D_3A1P dot_func =
   2612             (CvFunc2D_3A1P)dot_tab[dsttype == CV_64FC1].fn_2d[CV_MAT_DEPTH(srctype)];
   2613 
   2614         if( !dot_func )
   2615             CV_ERROR( CV_StsUnsupportedFormat,
   2616             "The format of input vectors is not supported" );
   2617 
   2618         for( i = 0; i < count; i++ )
   2619         {
   2620             int a, b, delta;
   2621             if( !(i & 1) )
   2622                 a = 0, b = i+1, delta = 1;
   2623             else
   2624                 a = i, b = -1, delta = -1;
   2625 
   2626             for( j = a; j != b; j += delta )
   2627             {
   2628                 double result = 0;
   2629                 void *v_i, *v_j;
   2630                 int step_i, step_j;
   2631 
   2632                 if( !vecmat )
   2633                 {
   2634                     v_i = vecdata[i].ptr;
   2635                     v_j = vecdata[j].ptr;
   2636                     step_i = vecdata[i].step;
   2637                     step_j = vecdata[j].step;
   2638                 }
   2639                 else
   2640                 {
   2641                     v_i = vecmat->data.ptr + vec_delta*i;
   2642                     v_j = vecmat->data.ptr + vec_delta*j;
   2643                     step_i = step_j = vec_step;
   2644                 }
   2645 
   2646                 dot_func( v_i, step_i, v_j, step_j, avg->data.ptr, avg->step, srcsize, &result );
   2647 
   2648                 if( dsttype == CV_64FC1 )
   2649                 {
   2650                     ((double*)(cov->data.ptr + i*cov->step))[j] =
   2651                     ((double*)(cov->data.ptr + j*cov->step))[i] = result*scale;
   2652                 }
   2653                 else
   2654                 {
   2655                     ((float*)(cov->data.ptr + i*cov->step))[j] =
   2656                     ((float*)(cov->data.ptr + j*cov->step))[i] = (float)(result*scale);
   2657                 }
   2658             }
   2659         }
   2660     }
   2661     else
   2662     {
   2663         uchar* cov_ptr = cov->data.ptr;
   2664         int cov_step = cov->step;
   2665         int cov_size = cov->rows;
   2666         CvFunc2D_3A1P ext_func =
   2667             (CvFunc2D_3A1P)ext_tab[dsttype == CV_64FC1].fn_2d[CV_MAT_DEPTH(srctype)];
   2668         if( !ext_func )
   2669             CV_ERROR( CV_StsUnsupportedFormat,
   2670             "The format of input vectors is not supported" );
   2671 
   2672         cvZero( cov );
   2673 
   2674         for( i = 0; i < count; i++ )
   2675         {
   2676             void* v;
   2677             int vstep;
   2678             if( !vecmat )
   2679             {
   2680                 v = vecdata[i].ptr;
   2681                 vstep = vecdata[i].step;
   2682             }
   2683             else
   2684             {
   2685                 v = vecmat->data.ptr + vec_delta*i;
   2686                 vstep = vec_step;
   2687             }
   2688 
   2689             ext_func( v, vstep, avg->data.ptr, avg->step,
   2690                       cov_ptr, cov_step, srcsize, tempvec->data.ptr );
   2691         }
   2692 
   2693         if( dsttype == CV_64FC1 )
   2694             for( i = 0; i < cov_size; i++ )
   2695                 for( j = 0; j <= i; j++ )
   2696                 {
   2697                     double* cov1 = ((double*)(cov_ptr + i*cov_step)) + j;
   2698                     double* cov2 = ((double*)(cov_ptr + j*cov_step)) + i;
   2699 
   2700                     if( flags & CV_COVAR_SCALE )
   2701                         *cov1 = *cov2 = *cov1*scale;
   2702                     else
   2703                         *cov2 = *cov1;
   2704                 }
   2705         else
   2706             for( i = 0; i < cov_size; i++ )
   2707                 for( j = 0; j <= i; j++ )
   2708                 {
   2709                     float* cov1 = ((float*)(cov_ptr + i*cov_step)) + j;
   2710                     float* cov2 = ((float*)(cov_ptr + j*cov_step)) + i;
   2711 
   2712                     if( flags & CV_COVAR_SCALE )
   2713                         *cov1 = *cov2 = (float)(*cov1*scale);
   2714                     else
   2715                         *cov2 = *cov1;
   2716                 }
   2717     }
   2718 
   2719     __END__;
   2720 
   2721     cvFree( &vecdata );
   2722     cvReleaseMat( &tempvec );
   2723 }
   2724 
   2725 /****************************************************************************************\
   2726 *                                        cvMahalanobis                                   *
   2727 \****************************************************************************************/
   2728 
   2729 #define ICV_MAHALANOBIS( flavor, arrtype )                                              \
   2730 static CvStatus CV_STDCALL                                                              \
   2731 icvMahalanobis_##flavor##_C1R( const arrtype* mat, int matstep,                         \
   2732                                const arrtype* vec, int len, double* _result )           \
   2733 {                                                                                       \
   2734     int i, j;                                                                           \
   2735     double result = 0;                                                                  \
   2736                                                                                         \
   2737     matstep /= sizeof(mat[0]);                                                          \
   2738     for( i = 0; i < len; i++, mat += matstep )                                          \
   2739     {                                                                                   \
   2740         double row_sum = 0;                                                             \
   2741         for( j = 0; j <= len - 4; j += 4 )                                              \
   2742             row_sum += vec[j]*mat[j] + vec[j+1]*mat[j+1] +                              \
   2743                        vec[j+2]*mat[j+2] + vec[j+3]*mat[j+3];                           \
   2744         for( ; j < len; j++ )                                                           \
   2745             row_sum += vec[j]*mat[j];                                                   \
   2746         result += row_sum * vec[i];                                                     \
   2747     }                                                                                   \
   2748     *_result = result;                                                                  \
   2749                                                                                         \
   2750     return CV_OK;                                                                       \
   2751 }
   2752 
   2753 ICV_MAHALANOBIS( 32f, float )
   2754 ICV_MAHALANOBIS( 64f, double )
   2755 
   2756 static void  icvInitMahalanobisTable( CvFuncTable* tab )
   2757 {
   2758     tab->fn_2d[CV_32F] = (void*)icvMahalanobis_32f_C1R;
   2759     tab->fn_2d[CV_64F] = (void*)icvMahalanobis_64f_C1R;
   2760 }
   2761 
   2762 typedef CvStatus (CV_STDCALL * CvMahalanobisFunc)( const void* mat, int matstep,
   2763                                                    const void* vec, int len, double* _result );
   2764 
   2765 CV_IMPL double
   2766 cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, CvArr* matarr )
   2767 {
   2768     static CvFuncTable mahal_tab;
   2769     static int inittab = 0;
   2770     uchar* buffer = 0;
   2771     int local_alloc = 0;
   2772     double dist = 0;
   2773 
   2774     CV_FUNCNAME( "cvMahalanobis" );
   2775 
   2776     __BEGIN__;
   2777 
   2778     int buf_size, elem_size, len;
   2779     CvMat stubA, *srcA = (CvMat*)srcAarr;
   2780     CvMat stubB, *srcB = (CvMat*)srcBarr;
   2781     CvMat stub, *mat = (CvMat*)matarr;
   2782     CvMat temp;
   2783     CvMahalanobisFunc func;
   2784 
   2785     if( !inittab )
   2786     {
   2787         icvInitMahalanobisTable( &mahal_tab );
   2788         inittab = 1;
   2789     }
   2790 
   2791     if( !CV_IS_MAT(srcA) )
   2792         CV_CALL( srcA = cvGetMat( srcA, &stubA ));
   2793 
   2794     if( !CV_IS_MAT(srcB) )
   2795         CV_CALL( srcB = cvGetMat( srcB, &stubB ));
   2796 
   2797     if( !CV_IS_MAT(mat) )
   2798         CV_CALL( mat = cvGetMat( mat, &stub ));
   2799 
   2800     if( srcA->rows != 1 && srcA->cols != 1 )
   2801         CV_ERROR( CV_StsBadSize, "Input matrices must be 1-d vectors" );
   2802 
   2803     len = srcA->rows + srcA->cols - 1;
   2804 
   2805     if( !CV_ARE_SIZES_EQ(srcA,srcB) )
   2806         CV_ERROR( CV_StsUnmatchedSizes, "Input vectors have different sizes" );
   2807 
   2808     if( mat->rows != len || mat->cols != len )
   2809         CV_ERROR( CV_StsUnmatchedSizes, "Input vectors and covariation matrix have different sizes" );
   2810 
   2811     func = (CvMahalanobisFunc)mahal_tab.fn_2d[CV_MAT_DEPTH(srcA->type)];
   2812 
   2813     if( CV_MAT_CN(srcA->type) > 1 || !func )
   2814         CV_ERROR( CV_StsUnsupportedFormat,
   2815         "Only single-channel floating-point vectors are supported" );
   2816 
   2817     if( !CV_ARE_TYPES_EQ(srcA,srcB) || !CV_ARE_TYPES_EQ(srcA,mat) )
   2818         CV_ERROR( CV_StsUnmatchedSizes, "Input vectors have different sizes" );
   2819 
   2820     elem_size = CV_ELEM_SIZE(srcA->type);
   2821     buf_size = len*elem_size;
   2822 
   2823     if( buf_size <= CV_MAX_LOCAL_SIZE )
   2824     {
   2825         buffer = (uchar*)cvStackAlloc( buf_size );
   2826         local_alloc = 1;
   2827     }
   2828     else
   2829     {
   2830         CV_CALL( buffer = (uchar*)cvAlloc( buf_size ));
   2831     }
   2832 
   2833     temp = cvMat( srcA->rows, srcA->cols, srcA->type, buffer );
   2834     CV_CALL( cvSub( srcA, srcB, &temp ));
   2835 
   2836     IPPI_CALL( func( mat->data.ptr, mat->step, temp.data.ptr, len, &dist ));
   2837     dist = sqrt(dist);
   2838 
   2839     __END__;
   2840 
   2841     if( buffer && !local_alloc )
   2842         cvFree( &buffer );
   2843 
   2844     return  dist;
   2845 }
   2846 
   2847 
   2848 /****************************************************************************************\
   2849 *                                        cvMulTransposed                                 *
   2850 \****************************************************************************************/
   2851 
   2852 #define ICV_DEF_MULTRANS_R_FUNC( flavor, srctype, dsttype, load_macro )         \
   2853 static CvStatus CV_STDCALL                                                      \
   2854 icvMulTransposedR_##flavor( const srctype* src, int srcstep,                    \
   2855                        dsttype* dst, int dststep,                               \
   2856                        const dsttype* delta, int deltastep,                     \
   2857                        CvSize size, int delta_cols, double scale )              \
   2858 {                                                                               \
   2859     int i, j, k;                                                                \
   2860     dsttype* tdst = dst;                                                        \
   2861     dsttype* col_buf = 0;                                                       \
   2862     dsttype* delta_buf = 0;                                                     \
   2863     int local_alloc = 0;                                                        \
   2864     int buf_size = size.height*sizeof(dsttype);                                 \
   2865                                                                                 \
   2866     if( delta && delta_cols < size.width )                                      \
   2867     {                                                                           \
   2868         assert( delta_cols == 1 );                                              \
   2869         buf_size += 4*buf_size;                                                 \
   2870     }                                                                           \
   2871                                                                                 \
   2872     if( buf_size <= CV_MAX_LOCAL_SIZE )                                         \
   2873     {                                                                           \
   2874         col_buf = (dsttype*)cvStackAlloc( buf_size );                           \
   2875         local_alloc = 1;                                                        \
   2876     }                                                                           \
   2877     else                                                                        \
   2878     {                                                                           \
   2879         col_buf = (dsttype*)cvAlloc( buf_size );                                \
   2880         if( !col_buf )                                                          \
   2881             return CV_OUTOFMEM_ERR;                                             \
   2882     }                                                                           \
   2883                                                                                 \
   2884     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                       \
   2885     deltastep /= sizeof(delta[0]);                                              \
   2886                                                                                 \
   2887     if( delta && delta_cols < size.width )                                      \
   2888     {                                                                           \
   2889         delta_buf = col_buf + size.height;                                      \
   2890         for( i = 0; i < size.height; i++ )                                      \
   2891             delta_buf[i*4] = delta_buf[i*4+1] =                                 \
   2892                 delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep];       \
   2893         delta = delta_buf;                                                      \
   2894         deltastep = deltastep ? 4 : 0;                                          \
   2895     }                                                                           \
   2896                                                                                 \
   2897     if( !delta )                                                                \
   2898         for( i = 0; i < size.width; i++, tdst += dststep )                      \
   2899         {                                                                       \
   2900             for( k = 0; k < size.height; k++ )                                  \
   2901                 col_buf[k] = src[k*srcstep+i];                                  \
   2902                                                                                 \
   2903             for( j = i; j <= size.width - 4; j += 4 )                           \
   2904             {                                                                   \
   2905                 double s0 = 0, s1 = 0, s2 = 0, s3 = 0;                          \
   2906                 const srctype *tsrc = src + j;                                  \
   2907                                                                                 \
   2908                 for( k = 0; k < size.height; k++, tsrc += srcstep )             \
   2909                 {                                                               \
   2910                     double a = col_buf[k];                                      \
   2911                     s0 += a * load_macro(tsrc[0]);                              \
   2912                     s1 += a * load_macro(tsrc[1]);                              \
   2913                     s2 += a * load_macro(tsrc[2]);                              \
   2914                     s3 += a * load_macro(tsrc[3]);                              \
   2915                 }                                                               \
   2916                                                                                 \
   2917                 tdst[j] = (dsttype)(s0*scale);                                  \
   2918                 tdst[j+1] = (dsttype)(s1*scale);                                \
   2919                 tdst[j+2] = (dsttype)(s2*scale);                                \
   2920                 tdst[j+3] = (dsttype)(s3*scale);                                \
   2921             }                                                                   \
   2922                                                                                 \
   2923             for( ; j < size.width; j++ )                                        \
   2924             {                                                                   \
   2925                 double s0 = 0;                                                  \
   2926                 const srctype *tsrc = src + j;                                  \
   2927                                                                                 \
   2928                 for( k = 0; k < size.height; k++, tsrc += srcstep )             \
   2929                     s0 += col_buf[k] * tsrc[0];                                 \
   2930                                                                                 \
   2931                 tdst[j] = (dsttype)(s0*scale);                                  \
   2932             }                                                                   \
   2933         }                                                                       \
   2934     else                                                                        \
   2935         for( i = 0; i < size.width; i++, tdst += dststep )                      \
   2936         {                                                                       \
   2937             if( !delta_buf )                                                    \
   2938                 for( k = 0; k < size.height; k++ )                              \
   2939                     col_buf[k] = load_macro(src[k*srcstep+i]) - delta[k*deltastep+i]; \
   2940             else                                                                \
   2941                 for( k = 0; k < size.height; k++ )                              \
   2942                     col_buf[k] = load_macro(src[k*srcstep+i]) - delta_buf[k*deltastep]; \
   2943                                                                                 \
   2944             for( j = i; j <= size.width - 4; j += 4 )                           \
   2945             {                                                                   \
   2946                 double s0 = 0, s1 = 0, s2 = 0, s3 = 0;                          \
   2947                 const srctype *tsrc = src + j;                                  \
   2948                 const dsttype *d = delta_buf ? delta_buf : delta + j;           \
   2949                                                                                 \
   2950                 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) \
   2951                 {                                                               \
   2952                     double a = col_buf[k];                                      \
   2953                     s0 += a * (load_macro(tsrc[0]) - d[0]);                     \
   2954                     s1 += a * (load_macro(tsrc[1]) - d[1]);                     \
   2955                     s2 += a * (load_macro(tsrc[2]) - d[2]);                     \
   2956                     s3 += a * (load_macro(tsrc[3]) - d[3]);                     \
   2957                 }                                                               \
   2958                                                                                 \
   2959                 tdst[j] = (dsttype)(s0*scale);                                  \
   2960                 tdst[j+1] = (dsttype)(s1*scale);                                \
   2961                 tdst[j+2] = (dsttype)(s2*scale);                                \
   2962                 tdst[j+3] = (dsttype)(s3*scale);                                \
   2963             }                                                                   \
   2964                                                                                 \
   2965             for( ; j < size.width; j++ )                                        \
   2966             {                                                                   \
   2967                 double s0 = 0;                                                  \
   2968                 const srctype *tsrc = src + j;                                  \
   2969                 const dsttype *d = delta_buf ? delta_buf : delta + j;           \
   2970                                                                                 \
   2971                 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) \
   2972                     s0 += col_buf[k] * (load_macro(tsrc[0]) - d[0]);            \
   2973                                                                                 \
   2974                 tdst[j] = (dsttype)(s0*scale);                                  \
   2975             }                                                                   \
   2976         }                                                                       \
   2977                                                                                 \
   2978     /* fill the lower part of the destination matrix */                         \
   2979     for( i = 1; i < size.width; i++ )                                           \
   2980         for( j = 0; j < i; j++ )                                                \
   2981             dst[dststep*i + j] = dst[dststep*j + i];                            \
   2982                                                                                 \
   2983     if( col_buf && !local_alloc )                                               \
   2984         cvFree( &col_buf );                                                     \
   2985                                                                                 \
   2986     return CV_NO_ERR;                                                           \
   2987 }
   2988 
   2989 
   2990 #define ICV_DEF_MULTRANS_L_FUNC( flavor, srctype, dsttype, load_macro )         \
   2991 static CvStatus CV_STDCALL                                                      \
   2992 icvMulTransposedL_##flavor( const srctype* src, int srcstep,                    \
   2993                             dsttype* dst, int dststep,                          \
   2994                             dsttype* delta, int deltastep,                      \
   2995                             CvSize size, int delta_cols, double scale )         \
   2996 {                                                                               \
   2997     int i, j, k;                                                                \
   2998     dsttype* tdst = dst;                                                        \
   2999                                                                                 \
   3000     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                       \
   3001     deltastep /= sizeof(delta[0]);                                              \
   3002                                                                                 \
   3003     if( !delta )                                                                \
   3004         for( i = 0; i < size.height; i++, tdst += dststep )                     \
   3005             for( j = i; j < size.height; j++ )                                  \
   3006             {                                                                   \
   3007                 double s = 0;                                                   \
   3008                 const srctype *tsrc1 = src + i*srcstep;                         \
   3009                 const srctype *tsrc2 = src + j*srcstep;                         \
   3010                                                                                 \
   3011                 for( k = 0; k <= size.width - 4; k += 4 )                       \
   3012                     s += tsrc1[k]*tsrc2[k] + tsrc1[k+1]*tsrc2[k+1] +            \
   3013                          tsrc1[k+2]*tsrc2[k+2] + tsrc1[k+3]*tsrc2[k+3];         \
   3014                 for( ; k < size.width; k++ )                                    \
   3015                     s += tsrc1[k] * tsrc2[k];                                   \
   3016                 tdst[j] = (dsttype)(s*scale);                                   \
   3017             }                                                                   \
   3018     else                                                                        \
   3019     {                                                                           \
   3020         dsttype* row_buf = 0;                                                   \
   3021         int local_alloc = 0;                                                    \
   3022         int buf_size = size.width*sizeof(dsttype);                              \
   3023         dsttype delta_buf[4];                                                   \
   3024         int delta_shift = delta_cols == size.width ? 4 : 0;                     \
   3025                                                                                 \
   3026         if( buf_size <= CV_MAX_LOCAL_SIZE )                                     \
   3027         {                                                                       \
   3028             row_buf = (dsttype*)cvStackAlloc( buf_size );                       \
   3029             local_alloc = 1;                                                    \
   3030         }                                                                       \
   3031         else                                                                    \
   3032         {                                                                       \
   3033             row_buf = (dsttype*)cvAlloc( buf_size );                            \
   3034             if( !row_buf )                                                      \
   3035                 return CV_OUTOFMEM_ERR;                                         \
   3036         }                                                                       \
   3037                                                                                 \
   3038         for( i = 0; i < size.height; i++, tdst += dststep )                     \
   3039         {                                                                       \
   3040             const srctype *tsrc1 = src + i*srcstep;                             \
   3041             const dsttype *tdelta1 = delta + i*deltastep;                       \
   3042                                                                                 \
   3043             if( delta_cols < size.width )                                       \
   3044                 for( k = 0; k < size.width; k++ )                               \
   3045                     row_buf[k] = tsrc1[k] - tdelta1[0];                         \
   3046             else                                                                \
   3047                 for( k = 0; k < size.width; k++ )                               \
   3048                     row_buf[k] = tsrc1[k] - tdelta1[k];                         \
   3049                                                                                 \
   3050             for( j = i; j < size.height; j++ )                                  \
   3051             {                                                                   \
   3052                 double s = 0;                                                   \
   3053                 const srctype *tsrc2 = src + j*srcstep;                         \
   3054                 const dsttype *tdelta2 = delta + j*deltastep;                   \
   3055                 if( delta_cols < size.width )                                   \
   3056                 {                                                               \
   3057                     delta_buf[0] = delta_buf[1] =                               \
   3058                         delta_buf[2] = delta_buf[3] = tdelta2[0];               \
   3059                     tdelta2 = delta_buf;                                        \
   3060                 }                                                               \
   3061                 for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift ) \
   3062                     s += row_buf[k]*(load_macro(tsrc2[k]) - tdelta2[0]) +       \
   3063                          row_buf[k+1]*(load_macro(tsrc2[k+1]) - tdelta2[1]) +   \
   3064                          row_buf[k+2]*(load_macro(tsrc2[k+2]) - tdelta2[2]) +   \
   3065                          row_buf[k+3]*(load_macro(tsrc2[k+3]) - tdelta2[3]);    \
   3066                 for( ; k < size.width; k++, tdelta2++ )                         \
   3067                     s += row_buf[k]*(load_macro(tsrc2[k]) - tdelta2[0]);        \
   3068                 tdst[j] = (dsttype)(s*scale);                                   \
   3069             }                                                                   \
   3070         }                                                                       \
   3071                                                                                 \
   3072         if( row_buf && !local_alloc )                                           \
   3073             cvFree( &row_buf );                                                 \
   3074     }                                                                           \
   3075                                                                                 \
   3076     /* fill the lower part of the destination matrix */                         \
   3077     for( j = 0; j < size.height - 1; j++ )                                      \
   3078         for( i = j; i < size.height; i++ )                                      \
   3079             dst[dststep*i + j] = dst[dststep*j + i];                            \
   3080                                                                                 \
   3081     return CV_NO_ERR;                                                           \
   3082 }
   3083 
   3084 
   3085 ICV_DEF_MULTRANS_R_FUNC( 8u32f, uchar, float, CV_8TO32F )
   3086 ICV_DEF_MULTRANS_R_FUNC( 8u64f, uchar, double, CV_8TO32F )
   3087 ICV_DEF_MULTRANS_R_FUNC( 32f, float, float, CV_NOP )
   3088 ICV_DEF_MULTRANS_R_FUNC( 32f64f, float, double, CV_NOP )
   3089 ICV_DEF_MULTRANS_R_FUNC( 64f, double, double, CV_NOP )
   3090 ICV_DEF_MULTRANS_R_FUNC( 16u32f, ushort, float, CV_NOP )
   3091 ICV_DEF_MULTRANS_R_FUNC( 16u64f, ushort, double, CV_NOP )
   3092 ICV_DEF_MULTRANS_R_FUNC( 16s32f, short, float, CV_NOP )
   3093 ICV_DEF_MULTRANS_R_FUNC( 16s64f, short, double, CV_NOP )
   3094 
   3095 ICV_DEF_MULTRANS_L_FUNC( 8u32f, uchar, float, CV_8TO32F )
   3096 ICV_DEF_MULTRANS_L_FUNC( 8u64f, uchar, double, CV_8TO32F )
   3097 ICV_DEF_MULTRANS_L_FUNC( 32f, float, float, CV_NOP )
   3098 ICV_DEF_MULTRANS_L_FUNC( 32f64f, float, double, CV_NOP )
   3099 ICV_DEF_MULTRANS_L_FUNC( 64f, double, double, CV_NOP )
   3100 ICV_DEF_MULTRANS_L_FUNC( 16u32f, ushort, float, CV_NOP )
   3101 ICV_DEF_MULTRANS_L_FUNC( 16u64f, ushort, double, CV_NOP )
   3102 ICV_DEF_MULTRANS_L_FUNC( 16s32f, short, float, CV_NOP )
   3103 ICV_DEF_MULTRANS_L_FUNC( 16s64f, short, double, CV_NOP )
   3104 
   3105 
   3106 typedef CvStatus (CV_STDCALL * CvMulTransposedFunc)
   3107     ( const void* src, int srcstep,
   3108       void* dst, int dststep, const void* delta,
   3109       int deltastep, CvSize size, int delta_cols, double scale );
   3110 
   3111 CV_IMPL void
   3112 cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
   3113                  int order, const CvArr* deltaarr, double scale )
   3114 {
   3115     const int gemm_level = 100; // boundary above which GEMM is faster.
   3116     CvMat* src2 = 0;
   3117 
   3118     CV_FUNCNAME( "cvMulTransposed" );
   3119 
   3120     __BEGIN__;
   3121 
   3122     CvMat sstub, *src = (CvMat*)srcarr;
   3123     CvMat dstub, *dst = (CvMat*)dstarr;
   3124     CvMat deltastub, *delta = (CvMat*)deltaarr;
   3125     int stype, dtype;
   3126 
   3127     if( !CV_IS_MAT( src ))
   3128         CV_CALL( src = cvGetMat( src, &sstub ));
   3129 
   3130     if( !CV_IS_MAT( dst ))
   3131         CV_CALL( dst = cvGetMat( dst, &dstub ));
   3132 
   3133     if( delta )
   3134     {
   3135         if( !CV_IS_MAT( delta ))
   3136             CV_CALL( delta = cvGetMat( delta, &deltastub ));
   3137 
   3138         if( !CV_ARE_TYPES_EQ( dst, delta ))
   3139             CV_ERROR( CV_StsUnmatchedFormats, "" );
   3140 
   3141         if( (delta->rows != src->rows && delta->rows != 1) ||
   3142             (delta->cols != src->cols && delta->cols != 1) )
   3143             CV_ERROR( CV_StsUnmatchedSizes, "" );
   3144     }
   3145     else
   3146     {
   3147         delta = &deltastub;
   3148         delta->data.ptr = 0;
   3149         delta->step = 0;
   3150         delta->rows = delta->cols = 0;
   3151     }
   3152 
   3153     stype = CV_MAT_TYPE( src->type );
   3154     dtype = CV_MAT_TYPE( dst->type );
   3155 
   3156     if( dst->rows != dst->cols )
   3157         CV_ERROR( CV_StsBadSize, "The destination matrix must be square" );
   3158 
   3159     if( (order != 0 && src->cols != dst->cols) ||
   3160         (order == 0 && src->rows != dst->rows))
   3161         CV_ERROR( CV_StsUnmatchedSizes, "" );
   3162 
   3163     if( src->data.ptr == dst->data.ptr || (stype == dtype &&
   3164         (dst->cols >= gemm_level && dst->rows >= gemm_level &&
   3165          src->cols >= gemm_level && src->rows >= gemm_level)))
   3166     {
   3167         if( deltaarr )
   3168         {
   3169             CV_CALL( src2 = cvCreateMat( src->rows, src->cols, src->type ));
   3170             cvRepeat( delta, src2 );
   3171             cvSub( src, src2, src2 );
   3172             src = src2;
   3173         }
   3174         cvGEMM( src, src, scale, 0, 0, dst, order == 0 ? CV_GEMM_B_T : CV_GEMM_A_T );
   3175     }
   3176     else
   3177     {
   3178         CvMulTransposedFunc func =
   3179             stype == CV_8U && dtype == CV_32F ?
   3180             (order ? (CvMulTransposedFunc)icvMulTransposedR_8u32f :
   3181                     (CvMulTransposedFunc)icvMulTransposedL_8u32f) :
   3182             stype == CV_8U && dtype == CV_64F ?
   3183             (order ? (CvMulTransposedFunc)icvMulTransposedR_8u64f :
   3184                     (CvMulTransposedFunc)icvMulTransposedL_8u64f) :
   3185             stype == CV_16U && dtype == CV_32F ?
   3186             (order ? (CvMulTransposedFunc)icvMulTransposedR_16u32f :
   3187                     (CvMulTransposedFunc)icvMulTransposedL_16u32f) :
   3188             stype == CV_16U && dtype == CV_64F ?
   3189             (order ? (CvMulTransposedFunc)icvMulTransposedR_16u64f :
   3190                     (CvMulTransposedFunc)icvMulTransposedL_16u64f) :
   3191             stype == CV_16S && dtype == CV_32F ?
   3192             (order ? (CvMulTransposedFunc)icvMulTransposedR_16s32f :
   3193                     (CvMulTransposedFunc)icvMulTransposedL_16s32f) :
   3194             stype == CV_16S && dtype == CV_64F ?
   3195             (order ? (CvMulTransposedFunc)icvMulTransposedR_16s64f :
   3196                     (CvMulTransposedFunc)icvMulTransposedL_16s64f) :
   3197             stype == CV_32F && dtype == CV_32F ?
   3198             (order ? (CvMulTransposedFunc)icvMulTransposedR_32f :
   3199                     (CvMulTransposedFunc)icvMulTransposedL_32f) :
   3200             stype == CV_32F && dtype == CV_64F ?
   3201             (order ? (CvMulTransposedFunc)icvMulTransposedR_32f64f :
   3202                     (CvMulTransposedFunc)icvMulTransposedL_32f64f) :
   3203             stype == CV_64F && dtype == CV_64F ?
   3204             (order ? (CvMulTransposedFunc)icvMulTransposedR_64f :
   3205                     (CvMulTransposedFunc)icvMulTransposedL_64f) : 0;
   3206 
   3207         if( !func )
   3208             CV_ERROR( CV_StsUnsupportedFormat, "" );
   3209 
   3210         IPPI_CALL( func( src->data.ptr, src->step, dst->data.ptr, dst->step,
   3211                          delta->data.ptr, delta->step, cvGetMatSize( src ),
   3212                          delta->cols, scale ));
   3213     }
   3214 
   3215     __END__;
   3216 
   3217     if( src2 )
   3218         cvReleaseMat( &src2 );
   3219 }
   3220 
   3221 
   3222 /****************************************************************************************\
   3223 *                                        cvDotProduct                                    *
   3224 \****************************************************************************************/
   3225 
   3226 #define ICV_DEF_DOT_PROD_FUNC_2D( flavor, arrtype, temptype, sumtype )  \
   3227 static CvStatus CV_STDCALL                                              \
   3228 icvDotProduct_##flavor##_C1R( const arrtype* src1, int step1,           \
   3229                               const arrtype* src2, int step2,           \
   3230                               CvSize size, sumtype* _sum )              \
   3231 {                                                                       \
   3232     sumtype sum = 0;                                                    \
   3233     step1 /= sizeof(src1[0]); step2 /= sizeof(src2[0]);                 \
   3234                                                                         \
   3235     for( ; size.height--; src1 += step1, src2 += step2 )                \
   3236     {                                                                   \
   3237         int i;                                                          \
   3238                                                                         \
   3239         for( i = 0; i <= size.width - 4; i += 4 )                       \
   3240         {                                                               \
   3241             temptype t0 = (temptype)src1[i]*src2[i];                    \
   3242             temptype t1 = (temptype)src1[i+1]*src2[i+1];                \
   3243             t0 += (temptype)src1[i+2]*src2[i+2];                        \
   3244             t1 += (temptype)src1[i+3]*src2[i+3];                        \
   3245             sum += t0 + t1;                                             \
   3246         }                                                               \
   3247                                                                         \
   3248         for( ; i < size.width; i++ )                                    \
   3249         {                                                               \
   3250             sum += (temptype)src1[i]*src2[i];                           \
   3251         }                                                               \
   3252     }                                                                   \
   3253                                                                         \
   3254     *_sum = sum;                                                        \
   3255     return CV_OK;                                                       \
   3256 }
   3257 
   3258 
   3259 ICV_DEF_DOT_PROD_FUNC_2D( 8u, uchar, int, int64 )
   3260 ICV_DEF_DOT_PROD_FUNC_2D( 16u, ushort, int64, int64 )
   3261 ICV_DEF_DOT_PROD_FUNC_2D( 16s, short, int64, int64 )
   3262 ICV_DEF_DOT_PROD_FUNC_2D( 32s, int, double, double )
   3263 ICV_DEF_DOT_PROD_FUNC_2D( 32f, float, double, double )
   3264 ICV_DEF_DOT_PROD_FUNC_2D( 64f, double, double, double )
   3265 
   3266 #define icvDotProduct_8s_C1R 0
   3267 
   3268 CV_DEF_INIT_FUNC_TAB_2D( DotProduct, C1R )
   3269 
   3270 CV_IMPL double
   3271 cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
   3272 {
   3273     static CvFuncTable tab_2d;
   3274     static int inittab = 0;
   3275 
   3276     Cv64suf result;
   3277     result.f = 0;
   3278 
   3279     CV_FUNCNAME( "cvDotProduct" );
   3280 
   3281     __BEGIN__;
   3282 
   3283     CvMat stubA, *srcA = (CvMat*)srcAarr;
   3284     CvMat stubB, *srcB = (CvMat*)srcBarr;
   3285     CvSize size;
   3286     int type, depth;
   3287     CvFunc2D_2A1P func;
   3288 
   3289     if( !inittab )
   3290     {
   3291         icvInitDotProductC1RTable( &tab_2d );
   3292         inittab = 1;
   3293     }
   3294 
   3295     if( !CV_IS_MAT( srcA ))
   3296     {
   3297         int coi = 0;
   3298         CV_CALL( srcA = cvGetMat( srcA, &stubA, &coi ));
   3299         if( coi != 0 )
   3300             CV_ERROR( CV_BadCOI, "coi is not supported" );
   3301     }
   3302 
   3303     if( srcBarr == srcAarr )
   3304         srcB = srcA;
   3305     else
   3306     {
   3307         if( !CV_IS_MAT( srcB ))
   3308         {
   3309             int coi = 0;
   3310             CV_CALL( srcB = cvGetMat( srcB, &stubB, &coi ));
   3311 
   3312             if( coi != 0 )
   3313                 CV_ERROR( CV_BadCOI, "coi is not supported" );
   3314         }
   3315 
   3316         if( !CV_ARE_TYPES_EQ( srcA, srcB ))
   3317             CV_ERROR( CV_StsUnmatchedFormats, "" );
   3318 
   3319         if( !CV_ARE_SIZES_EQ( srcA, srcB ))
   3320             CV_ERROR( CV_StsUnmatchedSizes, "" );
   3321     }
   3322 
   3323     type = CV_MAT_TYPE( srcA->type );
   3324     size = cvGetMatSize( srcA );
   3325 
   3326     size.width *= CV_MAT_CN( type );
   3327     depth = CV_MAT_DEPTH( type );
   3328 
   3329     if( CV_IS_MAT_CONT( srcA->type & srcB->type ))
   3330     {
   3331         size.width *= size.height;
   3332 
   3333         if( size.width <= CV_MAX_INLINE_MAT_OP_SIZE )
   3334         {
   3335             if( depth == CV_32F )
   3336             {
   3337                 float* mA = srcA->data.fl;
   3338                 float* mB = srcB->data.fl;
   3339                 double sum = 0;
   3340                 do
   3341                     sum += (double)mA[size.width - 1]*mB[size.width - 1];
   3342                 while( --size.width );
   3343                 result.f = sum;
   3344                 EXIT;
   3345             }
   3346 
   3347             if( depth == CV_64F )
   3348             {
   3349                 double* mA = srcA->data.db;
   3350                 double* mB = srcB->data.db;
   3351                 double sum = 0;
   3352                 do
   3353                     sum += mA[size.width - 1]*mB[size.width - 1];
   3354                 while( --size.width );
   3355                 result.f = sum;
   3356                 EXIT;
   3357             }
   3358         }
   3359         size.height = 1;
   3360     }
   3361 
   3362     func = (CvFunc2D_2A1P)(tab_2d.fn_2d[depth]);
   3363     if( !func )
   3364         CV_ERROR( CV_StsUnsupportedFormat, "" );
   3365 
   3366     IPPI_CALL( func( srcA->data.ptr, srcA->step,
   3367                      srcB->data.ptr, srcB->step,
   3368                      size, &result ));
   3369 
   3370     if( depth < CV_32S )
   3371         result.f = (double)result.i;
   3372 
   3373     __END__;
   3374 
   3375     return result.f;
   3376 }
   3377 
   3378 /* End of file. */
   3379