/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

namespace cv { namespace hal {

// popCountTable[x] is the number of set bits in the byte value x.
static const uchar popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

// popCountTable2[x] is the number of non-zero 2-bit cells in the byte value x
// (used by the cellSize == 2 variants below).
static const uchar popCountTable2[] =
{
    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};

// popCountTable4[x] is the number of non-zero 4-bit cells (nibbles) in the
// byte value x (used by the cellSize == 4 variants below).
static const uchar popCountTable4[] =
{
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};

int normHamming(const uchar* a, int n)
{
    int i = 0;
    int result = 0;
#if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (; i <= n - 16; i += 16) {
            uint8x16_t A_vec = vld1q_u8 (a + i);
            // count the set bits in every byte, then fold the per-byte
            // counts together with pairwise widening adds
            uint8x16_t bitsSet = vcntq_u8 (A_vec);
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        // the two 64-bit partial sums are small, so reading only the low
        // 32 bits of each (lanes 0 and 2 of the s32 view) is safe
        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2), 0);
        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2), 2);
    }
#endif
    // scalar tail (and the whole computation when NEON is unavailable):
    // partially unrolled table lookups
    for( ; i <= n - 4; i += 4 )
        result += popCountTable[a[i]] + popCountTable[a[i+1]] +
                popCountTable[a[i+2]] + popCountTable[a[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i]];
    return result;
}

int normHamming(const uchar* a, const uchar* b, int n)
{
    int i = 0;
    int result = 0;
#if CV_NEON
    {
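        // Same reduction as in the single-array overload above, applied to
        // a ^ b: vcnt counts the differing bits per byte, and the pairwise
        // widening adds fold the per-byte counts into a single total.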
        uint32x4_t bits = vmovq_n_u32(0);
        for (; i <= n - 16; i += 16) {
            uint8x16_t A_vec = vld1q_u8 (a + i);
            uint8x16_t B_vec = vld1q_u8 (b + i);
            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8 (AxorB);
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2), 0);
        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2), 2);
    }
#endif
    for( ; i <= n - 4; i += 4 )
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i] ^ b[i]];
    return result;
}

int normHamming(const uchar* a, int n, int cellSize)
{
    if( cellSize == 1 )
        return normHamming(a, n);
    // for cellSize 2 and 4 a cell counts as "different" if any of its bits
    // is set, so the lookup table collapses each cell to 0 or 1
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
        return -1;  // unsupported cell size
    int i = 0;
    int result = 0;
#if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
#endif
    for( ; i < n; i++ )
        result += tab[a[i]];
    return result;
}

int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
{
    if( cellSize == 1 )
        return normHamming(a, b, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
        return -1;  // unsupported cell size
    int i = 0;
    int result = 0;
#if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
                tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
#endif
    for( ; i < n; i++ )
        result += tab[a[i] ^ b[i]];
    return result;
}

float normL2Sqr_(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SSE
    float CV_DECL_ALIGNED(16) buf[4];
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();

    // two independent accumulators, 8 floats per iteration
    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
        }
    }

    for( ; j < n; j++ )
    {
        float t = a[j] - b[j];
        d += t*t;
    }
    return d;
}


float normL1_(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SSE
    float CV_DECL_ALIGNED(16) buf[4];
    // mask that clears the sign bit of each float, giving |x|
    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    __m128 absmask = _mm_load_ps((const float*)absbuf);

    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
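    // spill the vector accumulator and finish the reduction in scalar code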
    d = buf[0] + buf[1] + buf[2] + buf[3];
#elif CV_NEON
    float32x4_t v_sum = vdupq_n_f32(0.0f);
    // vabdq_f32 computes |a - b| lane-wise
    for ( ; j <= n - 4; j += 4)
        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));

    float CV_DECL_ALIGNED(16) buf[4];
    vst1q_f32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }

    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}

int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    // _mm_sad_epu8 sums the absolute differences of each group of 8
    // unsigned bytes into the two 64-bit halves of the accumulator
    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    // add the two 64-bit partial sums and extract the low 32 bits
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0);
    for ( ; j <= n - 16; j += 16)
    {
        // per-byte absolute differences, widened to 16 and then 32 bits
        // before accumulation to avoid overflow
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}

}} //cv::hal
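
/* Usage sketch (illustrative only, not part of this file): computing the
   Hamming distance between two 32-byte binary descriptors, e.g. ORB outputs.
   The descriptor contents below are hypothetical.

       uchar desc1[32], desc2[32];
       // ... fill both descriptors ...
       int dist  = cv::hal::normHamming(desc1, desc2, 32);     // per-bit distance
       int dist2 = cv::hal::normHamming(desc1, desc2, 32, 2);  // per-2-bit-cell distance
*/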