// Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #include "precomp.hpp"
     44 
     45 namespace cv { namespace hal {
     46 
     47 static const uchar popCountTable[] =
     48 {
     49     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     50     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     51     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     52     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     53     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     54     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     55     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     56     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
     57 };
     58 
     59 static const uchar popCountTable2[] =
     60 {
     61     0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
     62     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
     63     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
     64     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
     65     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
     66     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
     67     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
     68     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
     69 };
     70 
     71 static const uchar popCountTable4[] =
     72 {
     73     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     74     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     75     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     76     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     77     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     78     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     79     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     80     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
     81 };
     82 
     83 int normHamming(const uchar* a, int n)
     84 {
     85     int i = 0;
     86     int result = 0;
     87 #if CV_NEON
     88     {
     89         uint32x4_t bits = vmovq_n_u32(0);
     90         for (; i <= n - 16; i += 16) {
     91             uint8x16_t A_vec = vld1q_u8 (a + i);
     92             uint8x16_t bitsSet = vcntq_u8 (A_vec);
     93             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
     94             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
     95             bits = vaddq_u32(bits, bitSet4);
     96         }
     97         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
     98         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
     99         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    100     }
    101 #endif
    102         for( ; i <= n - 4; i += 4 )
    103             result += popCountTable[a[i]] + popCountTable[a[i+1]] +
    104             popCountTable[a[i+2]] + popCountTable[a[i+3]];
    105     for( ; i < n; i++ )
    106         result += popCountTable[a[i]];
    107     return result;
    108 }
    109 
    110 int normHamming(const uchar* a, const uchar* b, int n)
    111 {
    112     int i = 0;
    113     int result = 0;
    114 #if CV_NEON
    115     {
    116         uint32x4_t bits = vmovq_n_u32(0);
    117         for (; i <= n - 16; i += 16) {
    118             uint8x16_t A_vec = vld1q_u8 (a + i);
    119             uint8x16_t B_vec = vld1q_u8 (b + i);
    120             uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
    121             uint8x16_t bitsSet = vcntq_u8 (AxorB);
    122             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
    123             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
    124             bits = vaddq_u32(bits, bitSet4);
    125         }
    126         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
    127         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
    128         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    129     }
    130 #endif
    131         for( ; i <= n - 4; i += 4 )
    132             result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
    133                     popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    134     for( ; i < n; i++ )
    135         result += popCountTable[a[i] ^ b[i]];
    136     return result;
    137 }
    138 
    139 int normHamming(const uchar* a, int n, int cellSize)
    140 {
    141     if( cellSize == 1 )
    142         return normHamming(a, n);
    143     const uchar* tab = 0;
    144     if( cellSize == 2 )
    145         tab = popCountTable2;
    146     else if( cellSize == 4 )
    147         tab = popCountTable4;
    148     else
    149         return -1;
    150     int i = 0;
    151     int result = 0;
    152 #if CV_ENABLE_UNROLLED
    153     for( ; i <= n - 4; i += 4 )
    154         result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
    155 #endif
    156     for( ; i < n; i++ )
    157         result += tab[a[i]];
    158     return result;
    159 }
    160 
    161 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
    162 {
    163     if( cellSize == 1 )
    164         return normHamming(a, b, n);
    165     const uchar* tab = 0;
    166     if( cellSize == 2 )
    167         tab = popCountTable2;
    168     else if( cellSize == 4 )
    169         tab = popCountTable4;
    170     else
    171         return -1;
    172     int i = 0;
    173     int result = 0;
    174     #if CV_ENABLE_UNROLLED
    175     for( ; i <= n - 4; i += 4 )
    176         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
    177                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
    178     #endif
    179     for( ; i < n; i++ )
    180         result += tab[a[i] ^ b[i]];
    181     return result;
    182 }
    183 
    184 float normL2Sqr_(const float* a, const float* b, int n)
    185 {
    186     int j = 0; float d = 0.f;
    187 #if CV_SSE
    188     float CV_DECL_ALIGNED(16) buf[4];
    189     __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    190 
    191     for( ; j <= n - 8; j += 8 )
    192     {
    193         __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
    194         __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
    195         d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
    196         d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
    197     }
    198     _mm_store_ps(buf, _mm_add_ps(d0, d1));
    199     d = buf[0] + buf[1] + buf[2] + buf[3];
    200 #endif
    201     {
    202         for( ; j <= n - 4; j += 4 )
    203         {
    204             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
    205             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
    206         }
    207     }
    208 
    209     for( ; j < n; j++ )
    210     {
    211         float t = a[j] - b[j];
    212         d += t*t;
    213     }
    214     return d;
    215 }
    216 
    217 
    218 float normL1_(const float* a, const float* b, int n)
    219 {
    220     int j = 0; float d = 0.f;
    221 #if CV_SSE
    222     float CV_DECL_ALIGNED(16) buf[4];
    223     static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    224     __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    225     __m128 absmask = _mm_load_ps((const float*)absbuf);
    226 
    227     for( ; j <= n - 8; j += 8 )
    228     {
    229         __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
    230         __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
    231         d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
    232         d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
    233     }
    234     _mm_store_ps(buf, _mm_add_ps(d0, d1));
    235     d = buf[0] + buf[1] + buf[2] + buf[3];
    236 #elif CV_NEON
    237     float32x4_t v_sum = vdupq_n_f32(0.0f);
    238     for ( ; j <= n - 4; j += 4)
    239         v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
    240 
    241     float CV_DECL_ALIGNED(16) buf[4];
    242     vst1q_f32(buf, v_sum);
    243     d = buf[0] + buf[1] + buf[2] + buf[3];
    244 #endif
    245     {
    246         for( ; j <= n - 4; j += 4 )
    247         {
    248             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
    249             std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
    250         }
    251     }
    252 
    253     for( ; j < n; j++ )
    254         d += std::abs(a[j] - b[j]);
    255     return d;
    256 }
    257 
    258 int normL1_(const uchar* a, const uchar* b, int n)
    259 {
    260     int j = 0, d = 0;
    261 #if CV_SSE
    262     __m128i d0 = _mm_setzero_si128();
    263 
    264     for( ; j <= n - 16; j += 16 )
    265     {
    266         __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
    267         __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
    268 
    269         d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    270     }
    271 
    272     for( ; j <= n - 4; j += 4 )
    273     {
    274         __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
    275         __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
    276 
    277         d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    278     }
    279     d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
    280 #elif CV_NEON
    281     uint32x4_t v_sum = vdupq_n_u32(0.0f);
    282     for ( ; j <= n - 16; j += 16)
    283     {
    284         uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
    285         uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
    286         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
    287         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    288     }
    289 
    290     uint CV_DECL_ALIGNED(16) buf[4];
    291     vst1q_u32(buf, v_sum);
    292     d = buf[0] + buf[1] + buf[2] + buf[3];
    293 #endif
    294     {
    295         for( ; j <= n - 4; j += 4 )
    296         {
    297             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
    298             std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
    299         }
    300     }
    301     for( ; j < n; j++ )
    302         d += std::abs(a[j] - b[j]);
    303     return d;
    304 }
    305 
    306 }} //cv::hal
    307