Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                        Intel License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
     14 // Third party copyrights are property of their respective owners.
     15 //
     16 // Redistribution and use in source and binary forms, with or without modification,
     17 // are permitted provided that the following conditions are met:
     18 //
     19 //   * Redistribution's of source code must retain the above copyright notice,
     20 //     this list of conditions and the following disclaimer.
     21 //
     22 //   * Redistribution's in binary form must reproduce the above copyright notice,
     23 //     this list of conditions and the following disclaimer in the documentation
     24 //     and/or other materials provided with the distribution.
     25 //
     26 //   * The name of Intel Corporation may not be used to endorse or promote products
     27 //     derived from this software without specific prior written permission.
     28 //
     29 // This software is provided by the copyright holders and contributors "as is" and
     30 // any express or implied warranties, including, but not limited to, the implied
     31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     32 // In no event shall the Intel Corporation or contributors be liable for any direct,
     33 // indirect, incidental, special, exemplary, or consequential damages
     34 // (including, but not limited to, procurement of substitute goods or services;
     35 // loss of use, data, or profits; or business interruption) however caused
     36 // and on any theory of liability, whether in contract, strict liability,
     37 // or tort (including negligence or otherwise) arising in any way out of
     38 // the use of this software, even if advised of the possibility of such damage.
     39 //
     40 //M*/
     41 
     42 #include "_cv.h"
     43 
     44 /*
     45  * This file includes the code, contributed by Simon Perreault
     46  * (the function icvMedianBlur_8u_CnR_O1)
     47  *
     48  * Constant-time median filtering -- http://nomis80.org/ctmf.html
     49  * Copyright (C) 2006 Simon Perreault
     50  *
     51  * Contact:
     52  *  Laboratoire de vision et systemes numeriques
     53  *  Pavillon Adrien-Pouliot
     54  *  Universite Laval
     55  *  Sainte-Foy, Quebec, Canada
     56  *  G1K 7P4
     57  *
     58  *  perreaul (at) gel.ulaval.ca
     59  */
     60 
     61 // uncomment the line below to force SSE2 mode
     62 //#define CV_SSE2 1
     63 
     64 /****************************************************************************************\
     65                                          Box Filter
     66 \****************************************************************************************/
     67 
     68 static void icvSumRow_8u32s( const uchar* src0, int* dst, void* params );
     69 static void icvSumRow_32f64f( const float* src0, double* dst, void* params );
     70 static void icvSumCol_32s8u( const int** src, uchar* dst, int dst_step,
     71                              int count, void* params );
     72 static void icvSumCol_32s16s( const int** src, short* dst, int dst_step,
     73                              int count, void* params );
     74 static void icvSumCol_32s32s( const int** src, int* dst, int dst_step,
     75                              int count, void* params );
     76 static void icvSumCol_64f32f( const double** src, float* dst, int dst_step,
     77                               int count, void* params );
     78 
     79 CvBoxFilter::CvBoxFilter()
     80 {
     81     min_depth = CV_32S;
     82     sum = 0;
     83     sum_count = 0;
     84     normalized = false;
     85 }
     86 
     87 
     88 CvBoxFilter::CvBoxFilter( int _max_width, int _src_type, int _dst_type,
     89                           bool _normalized, CvSize _ksize,
     90                           CvPoint _anchor, int _border_mode,
     91                           CvScalar _border_value )
     92 {
     93     min_depth = CV_32S;
     94     sum = 0;
     95     sum_count = 0;
     96     normalized = false;
     97     init( _max_width, _src_type, _dst_type, _normalized,
     98           _ksize, _anchor, _border_mode, _border_value );
     99 }
    100 
    101 
    102 CvBoxFilter::~CvBoxFilter()
    103 {
    104     clear();
    105 }
    106 
    107 
    108 void CvBoxFilter::init( int _max_width, int _src_type, int _dst_type,
    109                         bool _normalized, CvSize _ksize,
    110                         CvPoint _anchor, int _border_mode,
    111                         CvScalar _border_value )
    112 {
    113     CV_FUNCNAME( "CvBoxFilter::init" );
    114 
    115     __BEGIN__;
    116 
    117     sum = 0;
    118     normalized = _normalized;
    119 
    120     if( (normalized && CV_MAT_TYPE(_src_type) != CV_MAT_TYPE(_dst_type)) ||
    121         (!normalized && CV_MAT_CN(_src_type) != CV_MAT_CN(_dst_type)))
    122         CV_ERROR( CV_StsUnmatchedFormats,
    123         "In case of normalized box filter input and output must have the same type.\n"
    124         "In case of unnormalized box filter the number of input and output channels must be the same" );
    125 
    126     min_depth = CV_MAT_DEPTH(_src_type) == CV_8U ? CV_32S : CV_64F;
    127 
    128     CvBaseImageFilter::init( _max_width, _src_type, _dst_type, 1, _ksize,
    129                              _anchor, _border_mode, _border_value );
    130 
    131     scale = normalized ? 1./(ksize.width*ksize.height) : 1;
    132 
    133     if( CV_MAT_DEPTH(src_type) == CV_8U )
    134         x_func = (CvRowFilterFunc)icvSumRow_8u32s;
    135     else if( CV_MAT_DEPTH(src_type) == CV_32F )
    136         x_func = (CvRowFilterFunc)icvSumRow_32f64f;
    137     else
    138         CV_ERROR( CV_StsUnsupportedFormat, "Unknown/unsupported input image format" );
    139 
    140     if( CV_MAT_DEPTH(dst_type) == CV_8U )
    141     {
    142         if( !normalized )
    143             CV_ERROR( CV_StsBadArg, "Only normalized box filter can be used for 8u->8u transformation" );
    144         y_func = (CvColumnFilterFunc)icvSumCol_32s8u;
    145     }
    146     else if( CV_MAT_DEPTH(dst_type) == CV_16S )
    147     {
    148         if( normalized || CV_MAT_DEPTH(src_type) != CV_8U )
    149             CV_ERROR( CV_StsBadArg, "Only 8u->16s unnormalized box filter is supported in case of 16s output" );
    150         y_func = (CvColumnFilterFunc)icvSumCol_32s16s;
    151     }
    152 	else if( CV_MAT_DEPTH(dst_type) == CV_32S )
    153 	{
    154 		if( normalized || CV_MAT_DEPTH(src_type) != CV_8U )
    155 			CV_ERROR( CV_StsBadArg, "Only 8u->32s unnormalized box filter is supported in case of 32s output");
    156 
    157 		y_func = (CvColumnFilterFunc)icvSumCol_32s32s;
    158 	}
    159     else if( CV_MAT_DEPTH(dst_type) == CV_32F )
    160     {
    161         if( CV_MAT_DEPTH(src_type) != CV_32F )
    162             CV_ERROR( CV_StsBadArg, "Only 32f->32f box filter (normalized or not) is supported in case of 32f output" );
    163         y_func = (CvColumnFilterFunc)icvSumCol_64f32f;
    164     }
    165 	else{
    166 		CV_ERROR( CV_StsBadArg, "Unknown/unsupported destination image format" );
    167 	}
    168 
    169     __END__;
    170 }
    171 
    172 
    173 void CvBoxFilter::start_process( CvSlice x_range, int width )
    174 {
    175     CvBaseImageFilter::start_process( x_range, width );
    176     int i, psz = CV_ELEM_SIZE(work_type);
    177     uchar* s;
    178     buf_end -= buf_step;
    179     buf_max_count--;
    180     assert( buf_max_count >= max_ky*2 + 1 );
    181     s = sum = buf_end + cvAlign((width + ksize.width - 1)*CV_ELEM_SIZE(src_type), ALIGN);
    182     sum_count = 0;
    183 
    184     width *= psz;
    185     for( i = 0; i < width; i++ )
    186         s[i] = (uchar)0;
    187 }
    188 
    189 
    190 static void
    191 icvSumRow_8u32s( const uchar* src, int* dst, void* params )
    192 {
    193     const CvBoxFilter* state = (const CvBoxFilter*)params;
    194     int ksize = state->get_kernel_size().width;
    195     int width = state->get_width();
    196     int cn = CV_MAT_CN(state->get_src_type());
    197     int i, k;
    198 
    199     width = (width - 1)*cn; ksize *= cn;
    200 
    201     for( k = 0; k < cn; k++, src++, dst++ )
    202     {
    203         int s = 0;
    204         for( i = 0; i < ksize; i += cn )
    205             s += src[i];
    206         dst[0] = s;
    207         for( i = 0; i < width; i += cn )
    208         {
    209             s += src[i+ksize] - src[i];
    210             dst[i+cn] = s;
    211         }
    212     }
    213 }
    214 
    215 
    216 static void
    217 icvSumRow_32f64f( const float* src, double* dst, void* params )
    218 {
    219     const CvBoxFilter* state = (const CvBoxFilter*)params;
    220     int ksize = state->get_kernel_size().width;
    221     int width = state->get_width();
    222     int cn = CV_MAT_CN(state->get_src_type());
    223     int i, k;
    224 
    225     width = (width - 1)*cn; ksize *= cn;
    226 
    227     for( k = 0; k < cn; k++, src++, dst++ )
    228     {
    229         double s = 0;
    230         for( i = 0; i < ksize; i += cn )
    231             s += src[i];
    232         dst[0] = s;
    233         for( i = 0; i < width; i += cn )
    234         {
    235             s += (double)src[i+ksize] - src[i];
    236             dst[i+cn] = s;
    237         }
    238     }
    239 }
    240 
    241 
    242 static void
    243 icvSumCol_32s8u( const int** src, uchar* dst,
    244                  int dst_step, int count, void* params )
    245 {
    246 #define BLUR_SHIFT 24
    247     CvBoxFilter* state = (CvBoxFilter*)params;
    248     int ksize = state->get_kernel_size().height;
    249     int i, width = state->get_width();
    250     int cn = CV_MAT_CN(state->get_src_type());
    251     double scale = state->get_scale();
    252     int iscale = cvFloor(scale*(1 << BLUR_SHIFT));
    253     int* sum = (int*)state->get_sum_buf();
    254     int* _sum_count = state->get_sum_count_ptr();
    255     int sum_count = *_sum_count;
    256 
    257     width *= cn;
    258     src += sum_count;
    259     count += ksize - 1 - sum_count;
    260 
    261     for( ; count--; src++ )
    262     {
    263         const int* sp = src[0];
    264         if( sum_count+1 < ksize )
    265         {
    266             for( i = 0; i <= width - 2; i += 2 )
    267             {
    268                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    269                 sum[i] = s0; sum[i+1] = s1;
    270             }
    271 
    272             for( ; i < width; i++ )
    273                 sum[i] += sp[i];
    274 
    275             sum_count++;
    276         }
    277         else
    278         {
    279             const int* sm = src[-ksize+1];
    280             for( i = 0; i <= width - 2; i += 2 )
    281             {
    282                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    283                 int t0 = CV_DESCALE(s0*iscale, BLUR_SHIFT), t1 = CV_DESCALE(s1*iscale, BLUR_SHIFT);
    284                 s0 -= sm[i]; s1 -= sm[i+1];
    285                 sum[i] = s0; sum[i+1] = s1;
    286                 dst[i] = (uchar)t0; dst[i+1] = (uchar)t1;
    287             }
    288 
    289             for( ; i < width; i++ )
    290             {
    291                 int s0 = sum[i] + sp[i], t0 = CV_DESCALE(s0*iscale, BLUR_SHIFT);
    292                 sum[i] = s0 - sm[i]; dst[i] = (uchar)t0;
    293             }
    294             dst += dst_step;
    295         }
    296     }
    297 
    298     *_sum_count = sum_count;
    299 #undef BLUR_SHIFT
    300 }
    301 
    302 
    303 static void
    304 icvSumCol_32s16s( const int** src, short* dst,
    305                   int dst_step, int count, void* params )
    306 {
    307     CvBoxFilter* state = (CvBoxFilter*)params;
    308     int ksize = state->get_kernel_size().height;
    309     int ktotal = ksize*state->get_kernel_size().width;
    310     int i, width = state->get_width();
    311     int cn = CV_MAT_CN(state->get_src_type());
    312     int* sum = (int*)state->get_sum_buf();
    313     int* _sum_count = state->get_sum_count_ptr();
    314     int sum_count = *_sum_count;
    315 
    316     dst_step /= sizeof(dst[0]);
    317     width *= cn;
    318     src += sum_count;
    319     count += ksize - 1 - sum_count;
    320 
    321     for( ; count--; src++ )
    322     {
    323         const int* sp = src[0];
    324         if( sum_count+1 < ksize )
    325         {
    326             for( i = 0; i <= width - 2; i += 2 )
    327             {
    328                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    329                 sum[i] = s0; sum[i+1] = s1;
    330             }
    331 
    332             for( ; i < width; i++ )
    333                 sum[i] += sp[i];
    334 
    335             sum_count++;
    336         }
    337         else if( ktotal < 128 )
    338         {
    339             const int* sm = src[-ksize+1];
    340             for( i = 0; i <= width - 2; i += 2 )
    341             {
    342                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    343                 dst[i] = (short)s0; dst[i+1] = (short)s1;
    344                 s0 -= sm[i]; s1 -= sm[i+1];
    345                 sum[i] = s0; sum[i+1] = s1;
    346             }
    347 
    348             for( ; i < width; i++ )
    349             {
    350                 int s0 = sum[i] + sp[i];
    351                 dst[i] = (short)s0;
    352                 sum[i] = s0 - sm[i];
    353             }
    354             dst += dst_step;
    355         }
    356         else
    357         {
    358             const int* sm = src[-ksize+1];
    359             for( i = 0; i <= width - 2; i += 2 )
    360             {
    361                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    362                 dst[i] = CV_CAST_16S(s0); dst[i+1] = CV_CAST_16S(s1);
    363                 s0 -= sm[i]; s1 -= sm[i+1];
    364                 sum[i] = s0; sum[i+1] = s1;
    365             }
    366 
    367             for( ; i < width; i++ )
    368             {
    369                 int s0 = sum[i] + sp[i];
    370                 dst[i] = CV_CAST_16S(s0);
    371                 sum[i] = s0 - sm[i];
    372             }
    373             dst += dst_step;
    374         }
    375     }
    376 
    377     *_sum_count = sum_count;
    378 }
    379 
    380 static void
    381 icvSumCol_32s32s( const int** src, int * dst,
    382                   int dst_step, int count, void* params )
    383 {
    384     CvBoxFilter* state = (CvBoxFilter*)params;
    385     int ksize = state->get_kernel_size().height;
    386     int i, width = state->get_width();
    387     int cn = CV_MAT_CN(state->get_src_type());
    388     int* sum = (int*)state->get_sum_buf();
    389     int* _sum_count = state->get_sum_count_ptr();
    390     int sum_count = *_sum_count;
    391 
    392     dst_step /= sizeof(dst[0]);
    393     width *= cn;
    394     src += sum_count;
    395     count += ksize - 1 - sum_count;
    396 
    397     for( ; count--; src++ )
    398     {
    399         const int* sp = src[0];
    400         if( sum_count+1 < ksize )
    401         {
    402             for( i = 0; i <= width - 2; i += 2 )
    403             {
    404                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    405                 sum[i] = s0; sum[i+1] = s1;
    406             }
    407 
    408             for( ; i < width; i++ )
    409                 sum[i] += sp[i];
    410 
    411             sum_count++;
    412         }
    413         else
    414         {
    415             const int* sm = src[-ksize+1];
    416             for( i = 0; i <= width - 2; i += 2 )
    417             {
    418                 int s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    419                 dst[i] = s0; dst[i+1] = s1;
    420                 s0 -= sm[i]; s1 -= sm[i+1];
    421                 sum[i] = s0; sum[i+1] = s1;
    422             }
    423 
    424             for( ; i < width; i++ )
    425             {
    426                 int s0 = sum[i] + sp[i];
    427                 dst[i] = s0;
    428                 sum[i] = s0 - sm[i];
    429             }
    430             dst += dst_step;
    431         }
    432     }
    433 
    434     *_sum_count = sum_count;
    435 }
    436 
    437 
    438 static void
    439 icvSumCol_64f32f( const double** src, float* dst,
    440                   int dst_step, int count, void* params )
    441 {
    442     CvBoxFilter* state = (CvBoxFilter*)params;
    443     int ksize = state->get_kernel_size().height;
    444     int i, width = state->get_width();
    445     int cn = CV_MAT_CN(state->get_src_type());
    446     double scale = state->get_scale();
    447     bool normalized = state->is_normalized();
    448     double* sum = (double*)state->get_sum_buf();
    449     int* _sum_count = state->get_sum_count_ptr();
    450     int sum_count = *_sum_count;
    451 
    452     dst_step /= sizeof(dst[0]);
    453     width *= cn;
    454     src += sum_count;
    455     count += ksize - 1 - sum_count;
    456 
    457     for( ; count--; src++ )
    458     {
    459         const double* sp = src[0];
    460         if( sum_count+1 < ksize )
    461         {
    462             for( i = 0; i <= width - 2; i += 2 )
    463             {
    464                 double s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    465                 sum[i] = s0; sum[i+1] = s1;
    466             }
    467 
    468             for( ; i < width; i++ )
    469                 sum[i] += sp[i];
    470 
    471             sum_count++;
    472         }
    473         else
    474         {
    475             const double* sm = src[-ksize+1];
    476             if( normalized )
    477                 for( i = 0; i <= width - 2; i += 2 )
    478                 {
    479                     double s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    480                     double t0 = s0*scale, t1 = s1*scale;
    481                     s0 -= sm[i]; s1 -= sm[i+1];
    482                     dst[i] = (float)t0; dst[i+1] = (float)t1;
    483                     sum[i] = s0; sum[i+1] = s1;
    484                 }
    485             else
    486                 for( i = 0; i <= width - 2; i += 2 )
    487                 {
    488                     double s0 = sum[i] + sp[i], s1 = sum[i+1] + sp[i+1];
    489                     dst[i] = (float)s0; dst[i+1] = (float)s1;
    490                     s0 -= sm[i]; s1 -= sm[i+1];
    491                     sum[i] = s0; sum[i+1] = s1;
    492                 }
    493 
    494             for( ; i < width; i++ )
    495             {
    496                 double s0 = sum[i] + sp[i], t0 = s0*scale;
    497                 sum[i] = s0 - sm[i]; dst[i] = (float)t0;
    498             }
    499             dst += dst_step;
    500         }
    501     }
    502 
    503     *_sum_count = sum_count;
    504 }
    505 
    506 
    507 /****************************************************************************************\
    508                                       Median Filter
    509 \****************************************************************************************/
    510 
    511 #define CV_MINMAX_8U(a,b) \
    512     (t = CV_FAST_CAST_8U((a) - (b)), (b) += t, a -= t)
    513 
    514 #if CV_SSE2 && !defined __SSE2__
    515 #define __SSE2__ 1
    516 #include "emmintrin.h"
    517 #endif
    518 
    519 #if defined(__VEC__) || defined(__ALTIVEC__)
    520 #include <altivec.h>
    521 #undef bool
    522 #endif
    523 
    524 #if defined(__GNUC__)
    525 #define align(x) __attribute__ ((aligned (x)))
    526 #elif CV_SSE2 && (defined(__ICL) || (_MSC_VER >= 1300))
    527 #define align(x) __declspec(align(x))
    528 #else
    529 #define align(x)
    530 #endif
    531 
    532 #if _MSC_VER >= 1200
    533 #pragma warning( disable: 4244 )
    534 #endif
    535 
    536 /**
    537  * This structure represents a two-tier histogram. The first tier (known as the
    538  * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
    539  * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
    540  * coarse bucket designated by the 4 MSBs of the fine bucket value.
    541  *
    542  * The structure is aligned on 16 bits, which is a prerequisite for SIMD
    543  * instructions. Each bucket is 16 bit wide, which means that extra care must be
    544  * taken to prevent overflow.
    545  */
    546 typedef struct align(16)
    547 {
    548     ushort coarse[16];
    549     ushort fine[16][16];
    550 } Histogram;
    551 
    552 /**
    553  * HOP is short for Histogram OPeration. This macro makes an operation \a op on
    554  * histogram \a h for pixel value \a x. It takes care of handling both levels.
    555  */
    556 #define HOP(h,x,op) \
    557     h.coarse[x>>4] op; \
    558     *((ushort*) h.fine + x) op;
    559 
    560 #define COP(c,j,x,op) \
    561     h_coarse[ 16*(n*c+j) + (x>>4) ] op; \
    562     h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op;
    563 
    564 #if defined __SSE2__ || defined __MMX__ || defined __ALTIVEC__
    565 #define MEDIAN_HAVE_SIMD 1
    566 #else
    567 #define MEDIAN_HAVE_SIMD 0
    568 #endif
    569 
    570 /**
    571  * Adds histograms \a x and \a y and stores the result in \a y. Makes use of
    572  * SSE2, MMX or Altivec, if available.
    573  */
    574 #if defined(__SSE2__)
    575 static inline void histogram_add( const ushort x[16], ushort y[16] )
    576 {
    577     _mm_store_si128( (__m128i*) &y[0], _mm_add_epi16(
    578         _mm_load_si128((__m128i*) &y[0]), _mm_load_si128((__m128i*) &x[0] )));
    579     _mm_store_si128( (__m128i*) &y[8], _mm_add_epi16(
    580         _mm_load_si128((__m128i*) &y[8]), _mm_load_si128((__m128i*) &x[8] )));
    581 }
    582 #elif defined(__MMX__)
    583 static inline void histogram_add( const ushort x[16], ushort y[16] )
    584 {
    585     *(__m64*) &y[0]  = _mm_add_pi16( *(__m64*) &y[0],  *(__m64*) &x[0]  );
    586     *(__m64*) &y[4]  = _mm_add_pi16( *(__m64*) &y[4],  *(__m64*) &x[4]  );
    587     *(__m64*) &y[8]  = _mm_add_pi16( *(__m64*) &y[8],  *(__m64*) &x[8]  );
    588     *(__m64*) &y[12] = _mm_add_pi16( *(__m64*) &y[12], *(__m64*) &x[12] );
    589 }
    590 #elif defined(__ALTIVEC__)
    591 static inline void histogram_add( const ushort x[16], ushort y[16] )
    592 {
    593     *(vector ushort*) &y[0] = vec_add( *(vector ushort*) &y[0], *(vector ushort*) &x[0] );
    594     *(vector ushort*) &y[8] = vec_add( *(vector ushort*) &y[8], *(vector ushort*) &x[8] );
    595 }
    596 #else
    597 static inline void histogram_add( const ushort x[16], ushort y[16] )
    598 {
    599     int i;
    600     for( i = 0; i < 16; ++i )
    601         y[i] = (ushort)(y[i] + x[i]);
    602 }
    603 #endif
    604 
    605 /**
    606  * Subtracts histogram \a x from \a y and stores the result in \a y. Makes use
    607  * of SSE2, MMX or Altivec, if available.
    608  */
    609 #if defined(__SSE2__)
    610 static inline void histogram_sub( const ushort x[16], ushort y[16] )
    611 {
    612     _mm_store_si128( (__m128i*) &y[0], _mm_sub_epi16(
    613         _mm_load_si128((__m128i*) &y[0]), _mm_load_si128((__m128i*) &x[0] )));
    614     _mm_store_si128( (__m128i*) &y[8], _mm_sub_epi16(
    615         _mm_load_si128((__m128i*) &y[8]), _mm_load_si128((__m128i*) &x[8] )));
    616 }
    617 #elif defined(__MMX__)
    618 static inline void histogram_sub( const ushort x[16], ushort y[16] )
    619 {
    620     *(__m64*) &y[0]  = _mm_sub_pi16( *(__m64*) &y[0],  *(__m64*) &x[0]  );
    621     *(__m64*) &y[4]  = _mm_sub_pi16( *(__m64*) &y[4],  *(__m64*) &x[4]  );
    622     *(__m64*) &y[8]  = _mm_sub_pi16( *(__m64*) &y[8],  *(__m64*) &x[8]  );
    623     *(__m64*) &y[12] = _mm_sub_pi16( *(__m64*) &y[12], *(__m64*) &x[12] );
    624 }
    625 #elif defined(__ALTIVEC__)
    626 static inline void histogram_sub( const ushort x[16], ushort y[16] )
    627 {
    628     *(vector ushort*) &y[0] = vec_sub( *(vector ushort*) &y[0], *(vector ushort*) &x[0] );
    629     *(vector ushort*) &y[8] = vec_sub( *(vector ushort*) &y[8], *(vector ushort*) &x[8] );
    630 }
    631 #else
    632 static inline void histogram_sub( const ushort x[16], ushort y[16] )
    633 {
    634     int i;
    635     for( i = 0; i < 16; ++i )
    636         y[i] = (ushort)(y[i] - x[i]);
    637 }
    638 #endif
    639 
    640 static inline void histogram_muladd( int a, const ushort x[16],
    641         ushort y[16] )
    642 {
    643     int i;
    644     for ( i = 0; i < 16; ++i )
    645         y[i] = (ushort)(y[i] + a * x[i]);
    646 }
    647 
    648 static CvStatus CV_STDCALL
    649 icvMedianBlur_8u_CnR_O1( uchar* src, int src_step, uchar* dst, int dst_step,
    650                          CvSize size, int kernel_size, int cn, int pad_left, int pad_right )
    651 {
    652     int r = (kernel_size-1)/2;
    653     const int m = size.height, n = size.width;
    654     int i, j, k, c;
    655     const unsigned char *p, *q;
    656     Histogram H[4];
    657     ushort *h_coarse, *h_fine, luc[4][16];
    658 
    659     if( size.height < r || size.width < r )
    660         return CV_BADSIZE_ERR;
    661 
    662     assert( src );
    663     assert( dst );
    664     assert( r >= 0 );
    665     assert( size.width >= 2*r+1 );
    666     assert( size.height >= 2*r+1 );
    667     assert( src_step != 0 );
    668     assert( dst_step != 0 );
    669 
    670     h_coarse = (ushort*) cvAlloc(  1 * 16 * n * cn * sizeof(ushort) );
    671     h_fine   = (ushort*) cvAlloc( 16 * 16 * n * cn * sizeof(ushort) );
    672     memset( h_coarse, 0,  1 * 16 * n * cn * sizeof(ushort) );
    673     memset( h_fine,   0, 16 * 16 * n * cn * sizeof(ushort) );
    674 
    675     /* First row initialization */
    676     for ( j = 0; j < n; ++j ) {
    677         for ( c = 0; c < cn; ++c ) {
    678             COP( c, j, src[cn*j+c], += r+1 );
    679         }
    680     }
    681     for ( i = 0; i < r; ++i ) {
    682         for ( j = 0; j < n; ++j ) {
    683             for ( c = 0; c < cn; ++c ) {
    684                 COP( c, j, src[src_step*i+cn*j+c], ++ );
    685             }
    686         }
    687     }
    688 
    689     for ( i = 0; i < m; ++i ) {
    690 
    691         /* Update column histograms for entire row. */
    692         p = src + src_step * MAX( 0, i-r-1 );
    693         q = p + cn * n;
    694         for ( j = 0; p != q; ++j ) {
    695             for ( c = 0; c < cn; ++c, ++p ) {
    696                 COP( c, j, *p, -- );
    697             }
    698         }
    699 
    700         p = src + src_step * MIN( m-1, i+r );
    701         q = p + cn * n;
    702         for ( j = 0; p != q; ++j ) {
    703             for ( c = 0; c < cn; ++c, ++p ) {
    704                 COP( c, j, *p, ++ );
    705             }
    706         }
    707 
    708         /* First column initialization */
    709         memset( H, 0, cn*sizeof(H[0]) );
    710         memset( luc, 0, cn*sizeof(luc[0]) );
    711         if ( pad_left ) {
    712             for ( c = 0; c < cn; ++c ) {
    713                 histogram_muladd( r, &h_coarse[16*n*c], H[c].coarse );
    714             }
    715         }
    716         for ( j = 0; j < (pad_left ? r : 2*r); ++j ) {
    717             for ( c = 0; c < cn; ++c ) {
    718                 histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );
    719             }
    720         }
    721         for ( c = 0; c < cn; ++c ) {
    722             for ( k = 0; k < 16; ++k ) {
    723                 histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );
    724             }
    725         }
    726 
    727         for ( j = pad_left ? 0 : r; j < (pad_right ? n : n-r); ++j ) {
    728             for ( c = 0; c < cn; ++c ) {
    729                 int t = 2*r*r + 2*r, b, sum = 0;
    730                 ushort* segment;
    731 
    732                 histogram_add( &h_coarse[16*(n*c + MIN(j+r,n-1))], H[c].coarse );
    733 
    734                 /* Find median at coarse level */
    735                 for ( k = 0; k < 16 ; ++k ) {
    736                     sum += H[c].coarse[k];
    737                     if ( sum > t ) {
    738                         sum -= H[c].coarse[k];
    739                         break;
    740                     }
    741                 }
    742                 assert( k < 16 );
    743 
    744                 /* Update corresponding histogram segment */
    745                 if ( luc[c][k] <= j-r ) {
    746                     memset( &H[c].fine[k], 0, 16 * sizeof(ushort) );
    747                     for ( luc[c][k] = j-r; luc[c][k] < MIN(j+r+1,n); ++luc[c][k] ) {
    748                         histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
    749                     }
    750                     if ( luc[c][k] < j+r+1 ) {
    751                         histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
    752                         luc[c][k] = (ushort)(j+r+1);
    753                     }
    754                 }
    755                 else {
    756                     for ( ; luc[c][k] < j+r+1; ++luc[c][k] ) {
    757                         histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
    758                         histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
    759                     }
    760                 }
    761 
    762                 histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
    763 
    764                 /* Find median in segment */
    765                 segment = H[c].fine[k];
    766                 for ( b = 0; b < 16 ; ++b ) {
    767                     sum += segment[b];
    768                     if ( sum > t ) {
    769                         dst[dst_step*i+cn*j+c] = (uchar)(16*k + b);
    770                         break;
    771                     }
    772                 }
    773                 assert( b < 16 );
    774             }
    775         }
    776     }
    777 
    778 #if defined(__MMX__)
    779     _mm_empty();
    780 #endif
    781 
    782     cvFree(&h_coarse);
    783     cvFree(&h_fine);
    784 
    785 #undef HOP
    786 #undef COP
    787     return CV_OK;
    788 }
    789 
    790 
    791 #if _MSC_VER >= 1200
    792 #pragma warning( default: 4244 )
    793 #endif
    794 
    795 
    796 static CvStatus CV_STDCALL
    797 icvMedianBlur_8u_CnR_Om( uchar* src, int src_step, uchar* dst, int dst_step,
    798                          CvSize size, int m, int cn )
    799 {
    800     #define N  16
    801     int     zone0[4][N];
    802     int     zone1[4][N*N];
    803     int     x, y;
    804     int     n2 = m*m/2;
    805     int     nx = (m + 1)/2 - 1;
    806     uchar*  src_max = src + size.height*src_step;
    807     uchar*  src_right = src + size.width*cn;
    808 
    809     #define UPDATE_ACC01( pix, cn, op ) \
    810     {                                   \
    811         int p = (pix);                  \
    812         zone1[cn][p] op;                \
    813         zone0[cn][p >> 4] op;           \
    814     }
    815 
    816     if( size.height < nx || size.width < nx )
    817         return CV_BADSIZE_ERR;
    818 
    819     if( m == 3 )
    820     {
    821         size.width *= cn;
    822 
    823         for( y = 0; y < size.height; y++, dst += dst_step )
    824         {
    825             const uchar* src0 = src + src_step*(y-1);
    826             const uchar* src1 = src0 + src_step;
    827             const uchar* src2 = src1 + src_step;
    828             if( y == 0 )
    829                 src0 = src1;
    830             else if( y == size.height - 1 )
    831                 src2 = src1;
    832 
    833             for( x = 0; x < 2*cn; x++ )
    834             {
    835                 int x0 = x < cn ? x : size.width - 3*cn + x;
    836                 int x2 = x < cn ? x + cn : size.width - 2*cn + x;
    837                 int x1 = x < cn ? x0 : x2, t;
    838 
    839                 int p0 = src0[x0], p1 = src0[x1], p2 = src0[x2];
    840                 int p3 = src1[x0], p4 = src1[x1], p5 = src1[x2];
    841                 int p6 = src2[x0], p7 = src2[x1], p8 = src2[x2];
    842 
    843                 CV_MINMAX_8U(p1, p2); CV_MINMAX_8U(p4, p5);
    844                 CV_MINMAX_8U(p7, p8); CV_MINMAX_8U(p0, p1);
    845                 CV_MINMAX_8U(p3, p4); CV_MINMAX_8U(p6, p7);
    846                 CV_MINMAX_8U(p1, p2); CV_MINMAX_8U(p4, p5);
    847                 CV_MINMAX_8U(p7, p8); CV_MINMAX_8U(p0, p3);
    848                 CV_MINMAX_8U(p5, p8); CV_MINMAX_8U(p4, p7);
    849                 CV_MINMAX_8U(p3, p6); CV_MINMAX_8U(p1, p4);
    850                 CV_MINMAX_8U(p2, p5); CV_MINMAX_8U(p4, p7);
    851                 CV_MINMAX_8U(p4, p2); CV_MINMAX_8U(p6, p4);
    852                 CV_MINMAX_8U(p4, p2);
    853                 dst[x1] = (uchar)p4;
    854             }
    855 
    856             for( x = cn; x < size.width - cn; x++ )
    857             {
    858                 int p0 = src0[x-cn], p1 = src0[x], p2 = src0[x+cn];
    859                 int p3 = src1[x-cn], p4 = src1[x], p5 = src1[x+cn];
    860                 int p6 = src2[x-cn], p7 = src2[x], p8 = src2[x+cn];
    861                 int t;
    862 
    863                 CV_MINMAX_8U(p1, p2); CV_MINMAX_8U(p4, p5);
    864                 CV_MINMAX_8U(p7, p8); CV_MINMAX_8U(p0, p1);
    865                 CV_MINMAX_8U(p3, p4); CV_MINMAX_8U(p6, p7);
    866                 CV_MINMAX_8U(p1, p2); CV_MINMAX_8U(p4, p5);
    867                 CV_MINMAX_8U(p7, p8); CV_MINMAX_8U(p0, p3);
    868                 CV_MINMAX_8U(p5, p8); CV_MINMAX_8U(p4, p7);
    869                 CV_MINMAX_8U(p3, p6); CV_MINMAX_8U(p1, p4);
    870                 CV_MINMAX_8U(p2, p5); CV_MINMAX_8U(p4, p7);
    871                 CV_MINMAX_8U(p4, p2); CV_MINMAX_8U(p6, p4);
    872                 CV_MINMAX_8U(p4, p2);
    873 
    874                 dst[x] = (uchar)p4;
    875             }
    876         }
    877 
    878         return CV_OK;
    879     }
    880 
    881     for( x = 0; x < size.width; x++, dst += cn )
    882     {
    883         uchar* dst_cur = dst;
    884         uchar* src_top = src;
    885         uchar* src_bottom = src;
    886         int    k, c;
    887         int    x0 = -1;
    888         int    src_step1 = src_step, dst_step1 = dst_step;
    889 
    890         if( x % 2 != 0 )
    891         {
    892             src_bottom = src_top += src_step*(size.height-1);
    893             dst_cur += dst_step*(size.height-1);
    894             src_step1 = -src_step1;
    895             dst_step1 = -dst_step1;
    896         }
    897 
    898         if( x <= m/2 )
    899             nx++;
    900 
    901         if( nx < m )
    902             x0 = x < m/2 ? 0 : (nx-1)*cn;
    903 
    904         // init accumulator
    905         memset( zone0, 0, sizeof(zone0[0])*cn );
    906         memset( zone1, 0, sizeof(zone1[0])*cn );
    907 
    908         for( y = 0; y <= m/2; y++ )
    909         {
    910             for( c = 0; c < cn; c++ )
    911             {
    912                 if( y > 0 )
    913                 {
    914                     if( x0 >= 0 )
    915                         UPDATE_ACC01( src_bottom[x0+c], c, += (m - nx) );
    916                     for( k = 0; k < nx*cn; k += cn )
    917                         UPDATE_ACC01( src_bottom[k+c], c, ++ );
    918                 }
    919                 else
    920                 {
    921                     if( x0 >= 0 )
    922                         UPDATE_ACC01( src_bottom[x0+c], c, += (m - nx)*(m/2+1) );
    923                     for( k = 0; k < nx*cn; k += cn )
    924                         UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
    925                 }
    926             }
    927 
    928             if( (src_step1 > 0 && y < size.height-1) ||
    929                 (src_step1 < 0 && size.height-y-1 > 0) )
    930                 src_bottom += src_step1;
    931         }
    932 
    933         for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
    934         {
    935             // find median
    936             for( c = 0; c < cn; c++ )
    937             {
    938                 int s = 0;
    939                 for( k = 0; ; k++ )
    940                 {
    941                     int t = s + zone0[c][k];
    942                     if( t > n2 ) break;
    943                     s = t;
    944                 }
    945 
    946                 for( k *= N; ;k++ )
    947                 {
    948                     s += zone1[c][k];
    949                     if( s > n2 ) break;
    950                 }
    951 
    952                 dst_cur[c] = (uchar)k;
    953             }
    954 
    955             if( y+1 == size.height )
    956                 break;
    957 
    958             if( cn == 1 )
    959             {
    960                 for( k = 0; k < nx; k++ )
    961                 {
    962                     int p = src_top[k];
    963                     int q = src_bottom[k];
    964                     zone1[0][p]--;
    965                     zone0[0][p>>4]--;
    966                     zone1[0][q]++;
    967                     zone0[0][q>>4]++;
    968                 }
    969             }
    970             else if( cn == 3 )
    971             {
    972                 for( k = 0; k < nx*3; k += 3 )
    973                 {
    974                     UPDATE_ACC01( src_top[k], 0, -- );
    975                     UPDATE_ACC01( src_top[k+1], 1, -- );
    976                     UPDATE_ACC01( src_top[k+2], 2, -- );
    977 
    978                     UPDATE_ACC01( src_bottom[k], 0, ++ );
    979                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
    980                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
    981                 }
    982             }
    983             else
    984             {
    985                 assert( cn == 4 );
    986                 for( k = 0; k < nx*4; k += 4 )
    987                 {
    988                     UPDATE_ACC01( src_top[k], 0, -- );
    989                     UPDATE_ACC01( src_top[k+1], 1, -- );
    990                     UPDATE_ACC01( src_top[k+2], 2, -- );
    991                     UPDATE_ACC01( src_top[k+3], 3, -- );
    992 
    993                     UPDATE_ACC01( src_bottom[k], 0, ++ );
    994                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
    995                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
    996                     UPDATE_ACC01( src_bottom[k+3], 3, ++ );
    997                 }
    998             }
    999 
   1000             if( x0 >= 0 )
   1001             {
   1002                 for( c = 0; c < cn; c++ )
   1003                 {
   1004                     UPDATE_ACC01( src_top[x0+c], c, -= (m - nx) );
   1005                     UPDATE_ACC01( src_bottom[x0+c], c, += (m - nx) );
   1006                 }
   1007             }
   1008 
   1009             if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
   1010                 (src_step1 < 0 && src_bottom + src_step1 >= src) )
   1011                 src_bottom += src_step1;
   1012 
   1013             if( y >= m/2 )
   1014                 src_top += src_step1;
   1015         }
   1016 
   1017         if( x >= m/2 )
   1018             src += cn;
   1019         if( src + nx*cn > src_right ) nx--;
   1020     }
   1021 #undef N
   1022 #undef UPDATE_ACC
   1023     return CV_OK;
   1024 }
   1025 
   1026 
   1027 /****************************************************************************************\
   1028                                    Bilateral Filtering
   1029 \****************************************************************************************/
   1030 
   1031 static void
   1032 icvBilateralFiltering_8u( const CvMat* src, CvMat* dst, int d,
   1033                           double sigma_color, double sigma_space )
   1034 {
   1035     CvMat* temp = 0;
   1036     float* color_weight = 0;
   1037     float* space_weight = 0;
   1038     int* space_ofs = 0;
   1039 
   1040     CV_FUNCNAME( "icvBilateralFiltering_8u" );
   1041 
   1042     __BEGIN__;
   1043 
   1044     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
   1045     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
   1046     int cn = CV_MAT_CN(src->type);
   1047     int i, j, k, maxk, radius;
   1048     CvSize size = cvGetMatSize(src);
   1049 
   1050     if( (CV_MAT_TYPE(src->type) != CV_8UC1 &&
   1051         CV_MAT_TYPE(src->type) != CV_8UC3) ||
   1052         !CV_ARE_TYPES_EQ(src, dst) )
   1053         CV_ERROR( CV_StsUnsupportedFormat,
   1054         "Both source and destination must be 8-bit, single-channel or 3-channel images" );
   1055 
   1056     if( sigma_color <= 0 )
   1057         sigma_color = 1;
   1058     if( sigma_space <= 0 )
   1059         sigma_space = 1;
   1060 
   1061     if( d == 0 )
   1062         radius = cvRound(sigma_space*1.5);
   1063     else
   1064         radius = d/2;
   1065     radius = MAX(radius, 1);
   1066     d = radius*2 + 1;
   1067 
   1068     CV_CALL( temp = cvCreateMat( src->rows + radius*2,
   1069         src->cols + radius*2, src->type ));
   1070     CV_CALL( cvCopyMakeBorder( src, temp, cvPoint(radius,radius), IPL_BORDER_REPLICATE ));
   1071     CV_CALL( color_weight = (float*)cvAlloc(cn*256*sizeof(color_weight[0])));
   1072     CV_CALL( space_weight = (float*)cvAlloc(d*d*sizeof(space_weight[0])));
   1073     CV_CALL( space_ofs = (int*)cvAlloc(d*d*sizeof(space_ofs[0])));
   1074 
   1075     // initialize color-related bilateral filter coefficients
   1076     for( i = 0; i < 256*cn; i++ )
   1077         color_weight[i] = (float)exp(i*i*gauss_color_coeff);
   1078 
   1079     // initialize space-related bilateral filter coefficients
   1080     for( i = -radius, maxk = 0; i <= radius; i++ )
   1081         for( j = -radius; j <= radius; j++ )
   1082         {
   1083             double r = sqrt((double)i*i + (double)j*j);
   1084             if( r > radius )
   1085                 continue;
   1086             space_weight[maxk] = (float)exp(r*r*gauss_space_coeff);
   1087             space_ofs[maxk++] = i*temp->step + j*cn;
   1088         }
   1089 
   1090     for( i = 0; i < size.height; i++ )
   1091     {
   1092         const uchar* sptr = temp->data.ptr + (i+radius)*temp->step + radius*cn;
   1093         uchar* dptr = dst->data.ptr + i*dst->step;
   1094 
   1095         if( cn == 1 )
   1096         {
   1097             for( j = 0; j < size.width; j++ )
   1098             {
   1099                 float sum = 0, wsum = 0;
   1100                 int val0 = sptr[j];
   1101                 for( k = 0; k < maxk; k++ )
   1102                 {
   1103                     int val = sptr[j + space_ofs[k]];
   1104                     float w = space_weight[k]*color_weight[abs(val - val0)];
   1105                     sum += val*w;
   1106                     wsum += w;
   1107                 }
   1108                 // overflow is not possible here => there is no need to use CV_CAST_8U
   1109                 dptr[j] = (uchar)cvRound(sum/wsum);
   1110             }
   1111         }
   1112         else
   1113         {
   1114             assert( cn == 3 );
   1115             for( j = 0; j < size.width*3; j += 3 )
   1116             {
   1117                 float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
   1118                 int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
   1119                 for( k = 0; k < maxk; k++ )
   1120                 {
   1121                     const uchar* sptr_k = sptr + j + space_ofs[k];
   1122                     int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
   1123                     float w = space_weight[k]*color_weight[abs(b - b0) +
   1124                         abs(g - g0) + abs(r - r0)];
   1125                     sum_b += b*w; sum_g += g*w; sum_r += r*w;
   1126                     wsum += w;
   1127                 }
   1128                 wsum = 1.f/wsum;
   1129                 b0 = cvRound(sum_b*wsum);
   1130                 g0 = cvRound(sum_g*wsum);
   1131                 r0 = cvRound(sum_r*wsum);
   1132                 dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
   1133             }
   1134         }
   1135     }
   1136 
   1137     __END__;
   1138 
   1139     cvReleaseMat( &temp );
   1140     cvFree( &color_weight );
   1141     cvFree( &space_weight );
   1142     cvFree( &space_ofs );
   1143 }
   1144 
   1145 
   1146 static void icvBilateralFiltering_32f( const CvMat* src, CvMat* dst, int d,
   1147                                        double sigma_color, double sigma_space )
   1148 {
   1149   	CvMat* temp = 0;
   1150     float* space_weight = 0;
   1151     int* space_ofs = 0;
   1152     float *expLUT = 0;
   1153 
   1154     CV_FUNCNAME( "icvBilateralFiltering_32f" );
   1155 
   1156     __BEGIN__;
   1157 
   1158     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
   1159     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
   1160     int cn = CV_MAT_CN(src->type);
   1161     int i, j, k, maxk, radius;
   1162     double minValSrc=-1, maxValSrc=1;
   1163     const int kExpNumBinsPerChannel = 1 << 12;
   1164     int kExpNumBins = 0;
   1165     float lastExpVal = 1.f;
   1166     int temp_step;
   1167     float len, scale_index;
   1168     CvMat src_reshaped;
   1169 
   1170     CvSize size = cvGetMatSize(src);
   1171 
   1172     if( (CV_MAT_TYPE(src->type) != CV_32FC1 &&
   1173         CV_MAT_TYPE(src->type) != CV_32FC3) ||
   1174         !CV_ARE_TYPES_EQ(src, dst) )
   1175         CV_ERROR( CV_StsUnsupportedFormat,
   1176         "Both source and destination must be 32-bit float, single-channel or 3-channel images" );
   1177 
   1178     if( sigma_color <= 0 )
   1179         sigma_color = 1;
   1180     if( sigma_space <= 0 )
   1181         sigma_space = 1;
   1182 
   1183     if( d == 0 )
   1184         radius = cvRound(sigma_space*1.5);
   1185     else
   1186         radius = d/2;
   1187     radius = MAX(radius, 1);
   1188     d = radius*2 + 1;
   1189     // compute the min/max range for the input image (even if multichannel)
   1190 
   1191     CV_CALL( cvReshape( src, &src_reshaped, 1 ) );
   1192     CV_CALL( cvMinMaxLoc(&src_reshaped, &minValSrc, &maxValSrc) );
   1193 
   1194     // temporary copy of the image with borders for easy processing
   1195     CV_CALL( temp = cvCreateMat( src->rows + radius*2,
   1196         src->cols + radius*2, src->type ));
   1197     temp_step = temp->step/sizeof(float);
   1198     CV_CALL( cvCopyMakeBorder( src, temp, cvPoint(radius,radius), IPL_BORDER_REPLICATE ));
   1199     // allocate lookup tables
   1200     CV_CALL( space_weight = (float*)cvAlloc(d*d*sizeof(space_weight[0])));
   1201     CV_CALL( space_ofs = (int*)cvAlloc(d*d*sizeof(space_ofs[0])));
   1202 
   1203     // assign a length which is slightly more than needed
   1204     len = (float)(maxValSrc - minValSrc) * cn;
   1205     kExpNumBins = kExpNumBinsPerChannel * cn;
   1206     CV_CALL( expLUT = (float*)cvAlloc((kExpNumBins+2) * sizeof(expLUT[0])));
   1207     scale_index = kExpNumBins/len;
   1208 
   1209     // initialize the exp LUT
   1210     for( i = 0; i < kExpNumBins+2; i++ )
   1211     {
   1212         if( lastExpVal > 0.f )
   1213         {
   1214             double val =  i / scale_index;
   1215             expLUT[i] = (float)exp(val * val * gauss_color_coeff);
   1216             lastExpVal = expLUT[i];
   1217         }
   1218         else
   1219             expLUT[i] = 0.f;
   1220     }
   1221 
   1222     // initialize space-related bilateral filter coefficients
   1223     for( i = -radius, maxk = 0; i <= radius; i++ )
   1224         for( j = -radius; j <= radius; j++ )
   1225         {
   1226             double r = sqrt((double)i*i + (double)j*j);
   1227             if( r > radius )
   1228                 continue;
   1229             space_weight[maxk] = (float)exp(r*r*gauss_space_coeff);
   1230             space_ofs[maxk++] = i*temp_step + j*cn;
   1231         }
   1232 
   1233     for( i = 0; i < size.height; i++ )
   1234     {
   1235 	    const float* sptr = temp->data.fl + (i+radius)*temp_step + radius*cn;
   1236         float* dptr = (float*)(dst->data.ptr + i*dst->step);
   1237 
   1238         if( cn == 1 )
   1239         {
   1240             for( j = 0; j < size.width; j++ )
   1241             {
   1242                 float sum = 0, wsum = 0;
   1243                 float val0 = sptr[j];
   1244                 for( k = 0; k < maxk; k++ )
   1245                 {
   1246                     float val = sptr[j + space_ofs[k]];
   1247 					float alpha = (float)(fabs(val - val0)*scale_index);
   1248                     int idx = cvFloor(alpha);
   1249                     alpha -= idx;
   1250                     float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
   1251 	                sum += val*w;
   1252                     wsum += w;
   1253                 }
   1254                 dptr[j] = (float)(sum/wsum);
   1255             }
   1256         }
   1257         else
   1258         {
   1259             assert( cn == 3 );
   1260             for( j = 0; j < size.width*3; j += 3 )
   1261             {
   1262                 float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
   1263                 float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
   1264                 for( k = 0; k < maxk; k++ )
   1265                 {
   1266                     const float* sptr_k = sptr + j + space_ofs[k];
   1267                     float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
   1268 					float alpha = (float)((fabs(b - b0) + fabs(g - g0) + fabs(r - r0))*scale_index);
   1269                     int idx = cvFloor(alpha);
   1270                     alpha -= idx;
   1271                     float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
   1272                     sum_b += b*w; sum_g += g*w; sum_r += r*w;
   1273                     wsum += w;
   1274                 }
   1275                 wsum = 1.f/wsum;
   1276                 b0 = sum_b*wsum;
   1277                 g0 = sum_g*wsum;
   1278                 r0 = sum_r*wsum;
   1279                 dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
   1280             }
   1281         }
   1282     }
   1283 
   1284     __END__;
   1285 
   1286     cvReleaseMat( &temp );
   1287     cvFree( &space_weight );
   1288     cvFree( &space_ofs );
   1289     cvFree( &expLUT );
   1290 }
   1291 
   1292 //////////////////////////////// IPP smoothing functions /////////////////////////////////
   1293 
   1294 icvFilterMedian_8u_C1R_t icvFilterMedian_8u_C1R_p = 0;
   1295 icvFilterMedian_8u_C3R_t icvFilterMedian_8u_C3R_p = 0;
   1296 icvFilterMedian_8u_C4R_t icvFilterMedian_8u_C4R_p = 0;
   1297 
   1298 icvFilterBox_8u_C1R_t icvFilterBox_8u_C1R_p = 0;
   1299 icvFilterBox_8u_C3R_t icvFilterBox_8u_C3R_p = 0;
   1300 icvFilterBox_8u_C4R_t icvFilterBox_8u_C4R_p = 0;
   1301 icvFilterBox_32f_C1R_t icvFilterBox_32f_C1R_p = 0;
   1302 icvFilterBox_32f_C3R_t icvFilterBox_32f_C3R_p = 0;
   1303 icvFilterBox_32f_C4R_t icvFilterBox_32f_C4R_p = 0;
   1304 
   1305 typedef CvStatus (CV_STDCALL * CvSmoothFixedIPPFunc)
   1306 ( const void* src, int srcstep, void* dst, int dststep,
   1307   CvSize size, CvSize ksize, CvPoint anchor );
   1308 
   1309 //////////////////////////////////////////////////////////////////////////////////////////
   1310 
   1311 CV_IMPL void
   1312 cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
   1313           int param1, int param2, double param3, double param4 )
   1314 {
   1315     CvBoxFilter box_filter;
   1316     CvSepFilter gaussian_filter;
   1317 
   1318     CvMat* temp = 0;
   1319 
   1320     CV_FUNCNAME( "cvSmooth" );
   1321 
   1322     __BEGIN__;
   1323 
   1324     int coi1 = 0, coi2 = 0;
   1325     CvMat srcstub, *src = (CvMat*)srcarr;
   1326     CvMat dststub, *dst = (CvMat*)dstarr;
   1327     CvSize size;
   1328     int src_type, dst_type, depth, cn;
   1329     double sigma1 = 0, sigma2 = 0;
   1330     bool have_ipp = icvFilterMedian_8u_C1R_p != 0;
   1331 
   1332     CV_CALL( src = cvGetMat( src, &srcstub, &coi1 ));
   1333     CV_CALL( dst = cvGetMat( dst, &dststub, &coi2 ));
   1334 
   1335     if( coi1 != 0 || coi2 != 0 )
   1336         CV_ERROR( CV_BadCOI, "" );
   1337 
   1338     src_type = CV_MAT_TYPE( src->type );
   1339     dst_type = CV_MAT_TYPE( dst->type );
   1340     depth = CV_MAT_DEPTH(src_type);
   1341     cn = CV_MAT_CN(src_type);
   1342     size = cvGetMatSize(src);
   1343 
   1344     if( !CV_ARE_SIZES_EQ( src, dst ))
   1345         CV_ERROR( CV_StsUnmatchedSizes, "" );
   1346 
   1347     if( smooth_type != CV_BLUR_NO_SCALE && !CV_ARE_TYPES_EQ( src, dst ))
   1348         CV_ERROR( CV_StsUnmatchedFormats,
   1349         "The specified smoothing algorithm requires input and ouput arrays be of the same type" );
   1350 
   1351     if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE ||
   1352         smooth_type == CV_GAUSSIAN || smooth_type == CV_MEDIAN )
   1353     {
   1354         // automatic detection of kernel size from sigma
   1355         if( smooth_type == CV_GAUSSIAN )
   1356         {
   1357             sigma1 = param3;
   1358             sigma2 = param4 ? param4 : param3;
   1359 
   1360             if( param1 == 0 && sigma1 > 0 )
   1361                 param1 = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
   1362             if( param2 == 0 && sigma2 > 0 )
   1363                 param2 = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
   1364         }
   1365 
   1366         if( param2 == 0 )
   1367             param2 = size.height == 1 ? 1 : param1;
   1368         if( param1 < 1 || (param1 & 1) == 0 || param2 < 1 || (param2 & 1) == 0 )
   1369             CV_ERROR( CV_StsOutOfRange,
   1370                 "Both mask width and height must be >=1 and odd" );
   1371 
   1372         if( param1 == 1 && param2 == 1 )
   1373         {
   1374             cvConvert( src, dst );
   1375             EXIT;
   1376         }
   1377     }
   1378 
   1379     if( have_ipp && (smooth_type == CV_BLUR || (smooth_type == CV_MEDIAN && param1 <= 15)) &&
   1380         size.width >= param1 && size.height >= param2 && param1 > 1 && param2 > 1 )
   1381     {
   1382         CvSmoothFixedIPPFunc ipp_median_box_func = 0;
   1383 
   1384         if( smooth_type == CV_BLUR )
   1385         {
   1386             ipp_median_box_func =
   1387                 src_type == CV_8UC1 ? icvFilterBox_8u_C1R_p :
   1388                 src_type == CV_8UC3 ? icvFilterBox_8u_C3R_p :
   1389                 src_type == CV_8UC4 ? icvFilterBox_8u_C4R_p :
   1390                 src_type == CV_32FC1 ? icvFilterBox_32f_C1R_p :
   1391                 src_type == CV_32FC3 ? icvFilterBox_32f_C3R_p :
   1392                 src_type == CV_32FC4 ? icvFilterBox_32f_C4R_p : 0;
   1393         }
   1394         else if( smooth_type == CV_MEDIAN )
   1395         {
   1396             ipp_median_box_func =
   1397                 src_type == CV_8UC1 ? icvFilterMedian_8u_C1R_p :
   1398                 src_type == CV_8UC3 ? icvFilterMedian_8u_C3R_p :
   1399                 src_type == CV_8UC4 ? icvFilterMedian_8u_C4R_p : 0;
   1400         }
   1401 
   1402         if( ipp_median_box_func )
   1403         {
   1404             CvSize el_size = { param1, param2 };
   1405             CvPoint el_anchor = { param1/2, param2/2 };
   1406             int stripe_size = 1 << 14; // the optimal value may depend on CPU cache,
   1407                                        // overhead of the current IPP code etc.
   1408             const uchar* shifted_ptr;
   1409             int y, dy = 0;
   1410             int temp_step, dst_step = dst->step;
   1411 
   1412             CV_CALL( temp = icvIPPFilterInit( src, stripe_size, el_size ));
   1413 
   1414             shifted_ptr = temp->data.ptr +
   1415                 el_anchor.y*temp->step + el_anchor.x*CV_ELEM_SIZE(src_type);
   1416             temp_step = temp->step ? temp->step : CV_STUB_STEP;
   1417 
   1418             for( y = 0; y < src->rows; y += dy )
   1419             {
   1420                 dy = icvIPPFilterNextStripe( src, temp, y, el_size, el_anchor );
   1421                 IPPI_CALL( ipp_median_box_func( shifted_ptr, temp_step,
   1422                     dst->data.ptr + y*dst_step, dst_step, cvSize(src->cols, dy),
   1423                     el_size, el_anchor ));
   1424             }
   1425             EXIT;
   1426         }
   1427     }
   1428 
   1429     if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
   1430     {
   1431         CV_CALL( box_filter.init( src->cols, src_type, dst_type,
   1432             smooth_type == CV_BLUR, cvSize(param1, param2) ));
   1433         CV_CALL( box_filter.process( src, dst ));
   1434     }
   1435     else if( smooth_type == CV_MEDIAN )
   1436     {
   1437         int img_size_mp = size.width*size.height;
   1438         img_size_mp = (img_size_mp + (1<<19)) >> 20;
   1439 
   1440         if( depth != CV_8U || (cn != 1 && cn != 3 && cn != 4) )
   1441             CV_ERROR( CV_StsUnsupportedFormat,
   1442             "Median filter only supports 8uC1, 8uC3 and 8uC4 images" );
   1443 
   1444         if( size.width < param1*2 || size.height < param1*2 ||
   1445             param1 <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*(MEDIAN_HAVE_SIMD ? 1 : 3))
   1446         {
   1447             // Special case optimized for 3x3
   1448             IPPI_CALL( icvMedianBlur_8u_CnR_Om( src->data.ptr, src->step,
   1449                 dst->data.ptr, dst->step, size, param1, cn ));
   1450         }
   1451         else
   1452         {
   1453             const int r = (param1 - 1) / 2;
   1454             const int CACHE_SIZE = (int) ( 0.95 * 256 * 1024 / cn );  // assume a 256 kB cache size
   1455             const int STRIPES = (int) cvCeil( (double) (size.width - 2*r) /
   1456                     (CACHE_SIZE / sizeof(Histogram) - 2*r) );
   1457             const int STRIPE_SIZE = (int) cvCeil(
   1458                     (double) ( size.width + STRIPES*2*r - 2*r ) / STRIPES );
   1459 
   1460             for( int i = 0; i < size.width; i += STRIPE_SIZE - 2*r )
   1461             {
   1462                 int stripe = STRIPE_SIZE;
   1463                 // Make sure that the filter kernel fits into one stripe.
   1464                 if( i + STRIPE_SIZE - 2*r >= size.width ||
   1465                     size.width - (i + STRIPE_SIZE - 2*r) < 2*r+1 )
   1466                     stripe = size.width - i;
   1467 
   1468                 IPPI_CALL( icvMedianBlur_8u_CnR_O1( src->data.ptr + cn*i, src->step,
   1469                     dst->data.ptr + cn*i, dst->step, cvSize(stripe, size.height),
   1470                     param1, cn, i == 0, stripe == size.width - i ));
   1471 
   1472                 if( stripe == size.width - i )
   1473                     break;
   1474             }
   1475         }
   1476     }
   1477     else if( smooth_type == CV_GAUSSIAN )
   1478     {
   1479         CvSize ksize = { param1, param2 };
   1480         float* kx = (float*)cvStackAlloc( ksize.width*sizeof(kx[0]) );
   1481         float* ky = (float*)cvStackAlloc( ksize.height*sizeof(ky[0]) );
   1482         CvMat KX = cvMat( 1, ksize.width, CV_32F, kx );
   1483         CvMat KY = cvMat( 1, ksize.height, CV_32F, ky );
   1484 
   1485         CvSepFilter::init_gaussian_kernel( &KX, sigma1 );
   1486         if( ksize.width != ksize.height || fabs(sigma1 - sigma2) > FLT_EPSILON )
   1487             CvSepFilter::init_gaussian_kernel( &KY, sigma2 );
   1488         else
   1489             KY.data.fl = kx;
   1490 
   1491         if( have_ipp && size.width >= param1*3 &&
   1492             size.height >= param2 && param1 > 1 && param2 > 1 )
   1493         {
   1494             int done;
   1495             CV_CALL( done = icvIPPSepFilter( src, dst, &KX, &KY,
   1496                         cvPoint(ksize.width/2,ksize.height/2)));
   1497             if( done )
   1498                 EXIT;
   1499         }
   1500 
   1501         CV_CALL( gaussian_filter.init( src->cols, src_type, dst_type, &KX, &KY ));
   1502         CV_CALL( gaussian_filter.process( src, dst ));
   1503     }
   1504     else if( smooth_type == CV_BILATERAL )
   1505     {
   1506         if( param2 != 0 && (param2 != param1 || param1 % 2 == 0) )
   1507             CV_ERROR( CV_StsBadSize, "Bilateral filter only supports square windows of odd size" );
   1508 
   1509         switch( src_type )
   1510         {
   1511         case CV_32FC1:
   1512         case CV_32FC3:
   1513             CV_CALL( icvBilateralFiltering_32f( src, dst, param1, param3, param4 ));
   1514             break;
   1515         case CV_8UC1:
   1516         case CV_8UC3:
   1517             CV_CALL( icvBilateralFiltering_8u( src, dst, param1, param3, param4 ));
   1518             break;
   1519         default:
   1520             CV_ERROR( CV_StsUnsupportedFormat,
   1521                 "Unknown/unsupported format: bilateral filter only supports 8uC1, 8uC3, 32fC1 and 32fC3 formats" );
   1522         }
   1523     }
   1524 
   1525     __END__;
   1526 
   1527     cvReleaseMat( &temp );
   1528 }
   1529 
   1530 /* End of file. */
   1531