/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{

template<typename T, int shift> struct FixPtCast
{
    typedef int type1;
    typedef T rtype;
    rtype operator ()(type1 arg) const { return (T)((arg + (1 << (shift-1))) >> shift); }
};

template<typename T, int shift> struct FltCast
{
    typedef T type1;
    typedef T rtype;
    rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
};

template<typename T1, typename T2> struct PyrDownNoVec
{
    int operator()(T1**, T2*, int, int) const { return 0; }
};

template<typename T1, typename T2> struct PyrUpNoVec
{
    int operator()(T1**, T2**, int, int) const { return 0; }
};
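/* Illustrative sketch, not part of the original file: pyrDown_/pyrUp_ below accumulate the
 * separable 5-tap binomial kernel 1-4-6-4-1 in the working type WT, and the cast functors
 * above perform the final normalization by 2^shift (256 for pyrDown, 64 for pyrUp), with
 * round-to-nearest in the fixed-point case. A minimal scalar example for one 8-bit output
 * sample, assuming s[0..4] holds one column of the 5x5 window and (hypothetically) that all
 * five rows of the window are identical, so the vertical pass reduces to a factor of 16:
 *
 *     static uchar pyrDownSampleSketch(const uchar s[5])
 *     {
 *         int acc = s[0] + 4*s[1] + 6*s[2] + 4*s[3] + s[4]; // one 1D pass, weights sum to 16
 *         return FixPtCast<uchar, 8>()(acc * 16);           // 16*16 == 256 == 1 << 8; adds 128, shifts right by 8
 *     }
 */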
#if CV_SSE2

struct PyrDownVec_32s8u
{
    int operator()(int** src, uchar* dst, int, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i delta = _mm_set1_epi16(128);

        for( ; x <= width - 16; x += 16 )
        {
            __m128i r0, r1, r2, r3, r4, t0, t1;
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)),
                                 _mm_load_si128((const __m128i*)(row0 + x + 4)));
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)),
                                 _mm_load_si128((const __m128i*)(row1 + x + 4)));
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)),
                                 _mm_load_si128((const __m128i*)(row2 + x + 4)));
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)),
                                 _mm_load_si128((const __m128i*)(row3 + x + 4)));
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)),
                                 _mm_load_si128((const __m128i*)(row4 + x + 4)));
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row0 + x + 12)));
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row1 + x + 12)));
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row2 + x + 12)));
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row3 + x + 12)));
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row4 + x + 12)));
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8);
            t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8);
            _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128();
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z);
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z);
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z);
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z);
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z);
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8);
            *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0));
        }

        return x;
    }
};

struct PyrDownVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256);
        for( ; x <= width - 8; x += 8 )
        {
            __m128 r0, r1, r2, r3, r4, t0, t1;
            r0 = _mm_load_ps(row0 + x);
            r1 = _mm_load_ps(row1 + x);
            r2 = _mm_load_ps(row2 + x);
            r3 = _mm_load_ps(row3 + x);
            r4 = _mm_load_ps(row4 + x);
            r0 = _mm_add_ps(r0, r4);
            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
            t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));

            r0 = _mm_load_ps(row0 + x + 4);
            r1 = _mm_load_ps(row1 + x + 4);
            r2 = _mm_load_ps(row2 + x + 4);
            r3 = _mm_load_ps(row3 + x + 4);
            r4 = _mm_load_ps(row4 + x + 4);
            r0 = _mm_add_ps(r0, r4);
            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
            t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));

            t0 = _mm_mul_ps(t0, _scale);
            t1 = _mm_mul_ps(t1, _scale);

            _mm_storeu_ps(dst + x, t0);
            _mm_storeu_ps(dst + x + 4, t1);
        }

        return x;
    }
};

#if CV_SSE4_1

struct PyrDownVec_32s16u
{
    PyrDownVec_32s16u()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator()(int** src, ushort* dst, int, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i v_delta = _mm_set1_epi32(128);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));

            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);

            v_r10 = _mm_slli_epi32(v_r10, 2);
            __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);

            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
            v_r11 = _mm_slli_epi32(v_r11, 2);
            __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
        }

        return x;
    }

    bool haveSSE;
};

#else

typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;

#endif // CV_SSE4_1

struct PyrDownVec_32s16s
{
    PyrDownVec_32s16s()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(int** src, short* dst, int, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i v_delta = _mm_set1_epi32(128);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));

            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);

            v_r10 = _mm_slli_epi32(v_r10, 2);
            __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);

            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
            v_r11 = _mm_slli_epi32(v_r11, 2);
            __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
        }

        return x;
    }

    bool haveSSE;
};

struct PyrUpVec_32s8u
{
    int operator()(int** src, uchar** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        uchar *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 16; x += 16 )
        {
            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));

            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
            v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
            v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row2 + x + 12)));

            v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));

            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

struct PyrUpVec_32s16s
{
    int operator()(int** src, short** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        short *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            v_2r1 = _mm_slli_epi32(v_r1, 1);
            v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
                                _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
                                _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);

            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

#if CV_SSE4_1

struct PyrUpVec_32s16u
{
    int operator()(int** src, ushort** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE4_1))
            return x;

        ushort *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            v_2r1 = _mm_slli_epi32(v_r1, 1);
            v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
                                 _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
                                 _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);

            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

#else

typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;

#endif // CV_SSE4_1

struct PyrUpVec_32f
{
    int operator()(float** src, float** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
        float *dst0 = dst[0], *dst1 = dst[1];
        __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
               v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_r0 = _mm_loadu_ps(row0 + x);
            __m128 v_r1 = _mm_loadu_ps(row1 + x);
            __m128 v_r2 = _mm_loadu_ps(row2 + x);

            _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
            _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));

            v_r0 = _mm_loadu_ps(row0 + x + 4);
            v_r1 = _mm_loadu_ps(row1 + x + 4);
            v_r2 = _mm_loadu_ps(row2 + x + 4);

            _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
            _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
        }

        return x;
    }
};

#elif CV_NEON

struct PyrDownVec_32s8u
{
    int operator()(int** src, uchar* dst, int, int width) const
    {
        int x = 0;
        const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
                           *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
                           *row4 = (unsigned int*)src[4];
        uint16x8_t v_delta = vdupq_n_u16(128);

        for( ; x <= width - 16; x += 16 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
            uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
            uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));

            v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
            v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
            uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));

            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
            v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
            v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));

            v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
            v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
            uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));

            vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
                                          vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
        }

        return x;
    }
};

struct PyrDownVec_32s16u
{
    int operator()(int** src, ushort* dst, int, int width) const
    {
        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        int32x4_t v_delta = vdupq_n_s32(128);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);

            v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
            v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);

            v_r10 = vshlq_n_s32(v_r10, 2);
            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);

            v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
            v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
            v_r11 = vshlq_n_s32(v_r11, 2);
            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);

            vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
        }

        return x;
    }
};

struct PyrDownVec_32s16s
{
    int operator()(int** src, short* dst, int, int width) const
    {
        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        int32x4_t v_delta = vdupq_n_s32(128);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);

            v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
            v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
            v_r10 = vshlq_n_s32(v_r10, 2);
            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);

            v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
            v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
            v_r11 = vshlq_n_s32(v_r11, 2);
            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
        }

        return x;
    }
};

struct PyrDownVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_r0 = vld1q_f32(row0 + x);
            float32x4_t v_r1 = vld1q_f32(row1 + x);
            float32x4_t v_r2 = vld1q_f32(row2 + x);
            float32x4_t v_r3 = vld1q_f32(row3 + x);
            float32x4_t v_r4 = vld1q_f32(row4 + x);

            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
            vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));

            v_r0 = vld1q_f32(row0 + x + 4);
            v_r1 = vld1q_f32(row1 + x + 4);
            v_r2 = vld1q_f32(row2 + x + 4);
            v_r3 = vld1q_f32(row3 + x + 4);
            v_r4 = vld1q_f32(row4 + x + 4);

            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
            vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
        }

        return x;
    }
};

struct PyrUpVec_32s8u
{
    int operator()(int** src, uchar** dst, int, int width) const
    {
        int x = 0;
        uchar *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        uint16x8_t v_delta = vdupq_n_u16(32);

        for( ; x <= width - 16; x += 16 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));

            uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
            uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
            uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);

            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));

            v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
            uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
            uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);

            vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)),
                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6))));
            vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)),
                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6))));
        }

        for( ; x <= width - 8; x += 8 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));

            uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
            uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
            uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);

            vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6)));
            vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6)));
        }

        return x;
    }
};

struct PyrUpVec_32s16u
{
    int operator()(int** src, ushort** dst, int, int width) const
    {
        int x = 0;
        ushort *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        uint32x4_t v_delta = vdupq_n_u32(32);

        for( ; x <= width - 8; x += 8 )
        {
            uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
            uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
            uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
            uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);

            v_r0 = vld1q_u32(row0 + x + 4);
            v_r1 = vld1q_u32(row1 + x + 4);
            v_r2 = vld1q_u32(row2 + x + 4);
            v_2r1 = vshlq_n_u32(v_r1, 1);
            v_4r1 = vshlq_n_u32(v_r1, 2);
            uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
            uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);

            vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)),
                                             vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6))));
            vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)),
                                             vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6))));
        }

        for( ; x <= width - 4; x += 4 )
        {
            uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
            uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);

            uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
            uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);

            vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6)));
            vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6)));
        }

        return x;
    }
};

struct PyrUpVec_32s16s
{
    int operator()(int** src, short** dst, int, int width) const
    {
        int x = 0;
        short *dst0 = dst[0], *dst1 = dst[1];
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
        int32x4_t v_delta = vdupq_n_s32(32);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
            int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
            int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
            int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);

            v_r0 = vld1q_s32(row0 + x + 4);
            v_r1 = vld1q_s32(row1 + x + 4);
            v_r2 = vld1q_s32(row2 + x + 4);
            v_2r1 = vshlq_n_s32(v_r1, 1);
            v_4r1 = vshlq_n_s32(v_r1, 2);
            int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
            int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);

            vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)),
                                             vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6))));
            vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)),
                                             vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6))));
        }

        for( ; x <= width - 4; x += 4 )
        {
            int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
            int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);

            int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
            int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);

            vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6)));
            vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6)));
        }

        return x;
    }
};

struct PyrUpVec_32f
{
    int operator()(float** src, float** dst, int, int width) const
    {
        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
        float *dst0 = dst[0], *dst1 = dst[1];
        float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_r0 = vld1q_f32(row0 + x);
            float32x4_t v_r1 = vld1q_f32(row1 + x);
            float32x4_t v_r2 = vld1q_f32(row2 + x);

            vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
            vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));

            v_r0 = vld1q_f32(row0 + x + 4);
            v_r1 = vld1q_f32(row1 + x + 4);
            v_r2 = vld1q_f32(row2 + x + 4);

            vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
            vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
        }

        return x;
    }
};

#else

typedef PyrDownNoVec<int, uchar> PyrDownVec_32s8u;
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
typedef PyrDownNoVec<float, float> PyrDownVec_32f;

typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
typedef PyrUpNoVec<float, float> PyrUpVec_32f;

#endif
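/* Added note (not in the original source): each *Vec functor above receives the five
 * (pyrDown) or three (pyrUp) pre-convolved row buffers and returns the number of output
 * columns it already produced with SIMD; the scalar loops in pyrDown_/pyrUp_ below resume
 * from that index, and the NoVec fallbacks simply return 0 so everything stays scalar.
 * Hypothetical shape of the call site, with illustrative names `rows`, `dstRow`, `step`:
 *
 *     int x = PyrDownVec_32s8u()(rows, dstRow, step, width);  // 0 if SSE2/NEON is unavailable
 *     for( ; x < width; x++ )
 *         dstRow[x] = castOp(rows[2][x]*6 + (rows[1][x] + rows[3][x])*4 + rows[0][x] + rows[4][x]);
 */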
template<class CastOp, class VecOp> void
pyrDown_( const Mat& _src, Mat& _dst, int borderType )
{
    const int PD_SZ = 5;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    CV_Assert( !_src.empty() );
    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize(dsize.width*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
    AutoBuffer<int> _tabM(dsize.width*cn);
    int* tabM = _tabM;
    WT* rows[PD_SZ];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
               std::abs(dsize.width*2 - ssize.width) <= 2 &&
               std::abs(dsize.height*2 - ssize.height) <= 2 );
    int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);

    for( x = 0; x <= PD_SZ+1; x++ )
    {
        int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
        int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
        for( k = 0; k < cn; k++ )
        {
            tabL[x*cn + k] = sx0 + k;
            tabR[x*cn + k] = sx1 + k;
        }
    }

    ssize.width *= cn;
    dsize.width *= cn;
    width0 *= cn;

    for( x = 0; x < dsize.width; x++ )
        tabM[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < dsize.height; y++ )
    {
        T* dst = _dst.ptr<T>(y);
        WT *row0, *row1, *row2, *row3, *row4;

        // fill the ring buffer (horizontal convolution and decimation)
        for( ; sy <= y*2 + 2; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
            int _sy = borderInterpolate(sy, ssize.height, borderType);
            const T* src = _src.ptr<T>(_sy);
            int limit = cn;
            const int* tab = tabL;

            for( x = 0;;)
            {
                for( ; x < limit; x++ )
                {
                    row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
                        src[tab[x]] + src[tab[x+cn*4]];
                }

                if( x == dsize.width )
                    break;

                if( cn == 1 )
                {
                    for( ; x < width0; x++ )
                        row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
                            src[x*2 - 2] + src[x*2 + 2];
                }
                else if( cn == 3 )
                {
                    for( ; x < width0; x += 3 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
                        WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
                        WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
                        row[x] = t0; row[x+1] = t1; row[x+2] = t2;
                    }
                }
                else if( cn == 4 )
                {
                    for( ; x < width0; x += 4 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
                        WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
                        row[x] = t0; row[x+1] = t1;
                        t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
                        t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
                        row[x+2] = t0; row[x+3] = t1;
                    }
                }
                else
                {
                    for( ; x < width0; x++ )
                    {
                        int sx = tabM[x];
                        row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
                            src[sx - cn*2] + src[sx + cn*2];
                    }
                }

                limit = dsize.width;
                tab = tabR - x;
            }
        }

        // do vertical convolution and decimation and write the result to the destination image
        for( k = 0; k < PD_SZ; k++ )
            rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];

        x = vecOp(rows, dst, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
            dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
    }
}
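/* Added note (not in the original source): the two 1-4-6-4-1 passes in pyrDown_ above are
 * equivalent to weighting the 5x5 source window centred at (2x, 2y) by the outer product of
 * the kernel with itself, divided by 256:
 *
 *     1   4   6   4   1
 *     4  16  24  16   4
 *     6  24  36  24   6      (all weights divided by 256)
 *     4  16  24  16   4
 *     1   4   6   4   1
 */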
template<class CastOp, class VecOp> void
pyrUp_( const Mat& _src, Mat& _dst, int)
{
    const int PU_SZ = 3;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize((dsize.width+1)*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PU_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    AutoBuffer<int> _dtab(ssize.width*cn);
    int* dtab = _dtab;
    WT* rows[PU_SZ];
    T* dsts[2];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
               std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
    int k, x, sy0 = -PU_SZ/2, sy = sy0;

    ssize.width *= cn;
    dsize.width *= cn;

    for( x = 0; x < ssize.width; x++ )
        dtab[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < ssize.height; y++ )
    {
        T* dst0 = _dst.ptr<T>(y*2);
        T* dst1 = _dst.ptr<T>(std::min(y*2+1, dsize.height-1));
        WT *row0, *row1, *row2;

        // fill the ring buffer (horizontal convolution and upsampling)
        for( ; sy <= y + 1; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
            int _sy = borderInterpolate(sy*2, dsize.height, BORDER_REFLECT_101)/2;
            const T* src = _src.ptr<T>(_sy);

            if( ssize.width == cn )
            {
                for( x = 0; x < cn; x++ )
                    row[x] = row[x + cn] = src[x]*8;
                continue;
            }

            for( x = 0; x < cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x]*6 + src[x + cn]*2;
                WT t1 = (src[x] + src[x + cn])*4;
                row[dx] = t0; row[dx + cn] = t1;
                dx = dtab[ssize.width - cn + x];
                int sx = ssize.width - cn + x;
                t0 = src[sx - cn] + src[sx]*7;
                t1 = src[sx]*8;
                row[dx] = t0; row[dx + cn] = t1;
            }

            for( x = cn; x < ssize.width - cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x-cn] + src[x]*6 + src[x+cn];
                WT t1 = (src[x] + src[x+cn])*4;
                row[dx] = t0;
                row[dx+cn] = t1;
            }
        }

        // do vertical convolution and upsampling and write the result to the destination image
        for( k = 0; k < PU_SZ; k++ )
            rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
        dsts[0] = dst0; dsts[1] = dst1;

        x = vecOp(rows, dsts, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
        {
            T t1 = castOp((row1[x] + row2[x])*4);
            T t0 = castOp(row0[x] + row1[x]*6 + row2[x]);
            dst1[x] = t1; dst0[x] = t0;
        }
    }
}

typedef void (*PyrFunc)(const Mat&, Mat&, int);
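/* Added note (not in the original source): pyrUp_ splits the same binomial kernel between
 * even and odd output samples. Per axis, an even output mixes the three nearest source
 * samples with weights 1-6-1 (sum 8) and an odd output mixes the two nearest with weights
 * 4-4 (sum 8), so after both passes the total weight is 8*8 = 64, matching the shift of 6
 * used by FixPtCast/FltCast in pyrUp below; for integer types the even row, for example, is
 *
 *     dst0[x] = (row0[x] + 6*row1[x] + row2[x] + 32) >> 6;   // round-to-nearest division by 64
 */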
#ifdef HAVE_OPENCL

static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (cn > 4 || (depth == CV_64F && !doubleSupport))
        return false;

    Size ssize = _src.size();
    Size dsize = _dsz.area() == 0 ? Size((ssize.width + 1) / 2, (ssize.height + 1) / 2) : _dsz;
    if (dsize.height < 2 || dsize.width < 2)
        return false;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
               std::abs(dsize.width*2 - ssize.width) <= 2 &&
               std::abs(dsize.height*2 - ssize.height) <= 2 );

    UMat src = _src.getUMat();
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 256;
    int kercn = 1;
    if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel())
        kercn = 4;
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                       "BORDER_REFLECT_101" };
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
            ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth),
            cn, kercn, float_depth, borderMap[borderType], local_size
    );
    ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));

    size_t localThreads[2]  = { local_size/kercn, 1 };
    size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, (dst.rows + 1) / 2 };
    return k.run(2, globalThreads, localThreads, false);
}

static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);

    if (channels > 4 || borderType != BORDER_DEFAULT)
        return false;

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (depth == CV_64F && !doubleSupport)
        return false;

    Size ssize = _src.size();
    if ((_dsz.area() != 0) && (_dsz != Size(ssize.width * 2, ssize.height * 2)))
        return false;

    UMat src = _src.getUMat();
    Size dsize = Size(ssize.width * 2, ssize.height * 2);
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 16;
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
            ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "",
            ocl::typeToStr(depth), channels, local_size
    );
    size_t globalThreads[2] = { dst.cols, dst.rows };
    size_t localThreads[2] = { local_size, local_size };
    ocl::Kernel k;
    if (ocl::Device::getDefault().isIntel() && channels == 1)
    {
        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
    }
    else
        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);

    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
    return k.run(2, globalThreads, localThreads, false);
}

#endif

}
void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_Assert(borderType != BORDER_CONSTANT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrDown(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
    _dst.create( dsz, src.type() );
    Mat dst = _dst.getMat();
    int depth = src.depth();

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrDown(src, dst))
        return;
#endif

#if IPP_VERSION_X100 >= 801 && 0
    CV_IPP_CHECK()
    {
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size((src.cols + 1)/2, (src.rows + 1)/2))
        {
            typedef IppStatus (CV_STDCALL * ippiPyrDown)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
            int type = src.type();
            CV_SUPPRESS_DEPRECATED_START
            ippiPyrDown pyrDownFunc = type == CV_8UC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C1R :
                                      type == CV_8UC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C3R :
                                      type == CV_32FC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C1R :
                                      type == CV_32FC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C3R : 0;
            CV_SUPPRESS_DEPRECATED_END

            if (pyrDownFunc)
            {
                int bufferSize;
                IppiSize srcRoi = { src.cols, src.rows };
                IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
                CV_SUPPRESS_DEPRECATED_START
                IppStatus ok = ippiPyrDownGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
                CV_SUPPRESS_DEPRECATED_END
                if (ok >= 0)
                {
                    Ipp8u* buffer = ippsMalloc_8u(bufferSize);
                    ok = pyrDownFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
                    ippsFree(buffer);

                    if (ok >= 0)
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

    PyrFunc func = 0;
    if( depth == CV_8U )
        func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
    else if( depth == CV_16S )
        func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
    else if( depth == CV_16U )
        func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
    else if( depth == CV_32F )
        func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
    else if( depth == CV_64F )
        func = pyrDown_<FltCast<double, 8>, PyrDownNoVec<double, double> >;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    func( src, dst, borderType );
}
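/* Illustrative usage, not part of the original file: a minimal sketch of how the public
 * pyrDown/pyrUp entry points defined here are typically combined to build one Laplacian
 * pyramid level; `img` is an assumed caller-provided CV_8UC3 image and `laplacianLevel`
 * is a hypothetical helper, not an OpenCV function.
 *
 *     #include <opencv2/imgproc.hpp>
 *
 *     static void laplacianLevel(const cv::Mat& img, cv::Mat& lap)
 *     {
 *         cv::Mat down, up;
 *         cv::pyrDown(img, down);                  // 5x5 Gaussian blur, then drop every other row/column
 *         cv::pyrUp(down, up, img.size());         // upsample back to the original size
 *         cv::subtract(img, up, lap, cv::noArray(), CV_16S);  // keep the detail that pyrDown removed
 *     }
 */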
void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_Assert(borderType == BORDER_DEFAULT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrUp(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
    _dst.create( dsz, src.type() );
    Mat dst = _dst.getMat();
    int depth = src.depth();

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrUp(src, dst))
        return;
#endif

#if IPP_VERSION_X100 >= 801 && 0
    CV_IPP_CHECK()
    {
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size(src.cols*2, src.rows*2))
        {
            typedef IppStatus (CV_STDCALL * ippiPyrUp)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
            int type = src.type();
            CV_SUPPRESS_DEPRECATED_START
            ippiPyrUp pyrUpFunc = type == CV_8UC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C1R :
                                  type == CV_8UC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C3R :
                                  type == CV_32FC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C1R :
                                  type == CV_32FC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C3R : 0;
            CV_SUPPRESS_DEPRECATED_END

            if (pyrUpFunc)
            {
                int bufferSize;
                IppiSize srcRoi = { src.cols, src.rows };
                IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
                CV_SUPPRESS_DEPRECATED_START
                IppStatus ok = ippiPyrUpGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
                CV_SUPPRESS_DEPRECATED_END
                if (ok >= 0)
                {
                    Ipp8u* buffer = ippsMalloc_8u(bufferSize);
                    ok = pyrUpFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
                    ippsFree(buffer);

                    if (ok >= 0)
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

    PyrFunc func = 0;
    if( depth == CV_8U )
        func = pyrUp_<FixPtCast<uchar, 6>, PyrUpVec_32s8u >;
    else if( depth == CV_16S )
        func = pyrUp_<FixPtCast<short, 6>, PyrUpVec_32s16s >;
    else if( depth == CV_16U )
        func = pyrUp_<FixPtCast<ushort, 6>, PyrUpVec_32s16u >;
    else if( depth == CV_32F )
        func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
    else if( depth == CV_64F )
        func = pyrUp_<FltCast<double, 6>, PyrUpNoVec<double, double> >;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    func( src, dst, borderType );
}

void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
{
    CV_Assert(borderType != BORDER_CONSTANT);

    if (_src.dims() <= 2 && _dst.isUMatVector())
    {
        UMat src = _src.getUMat();
        _dst.create( maxlevel + 1, 1, 0 );
        _dst.getUMatRef(0) = src;
        for( int i = 1; i <= maxlevel; i++ )
            pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
        return;
    }

    Mat src = _src.getMat();
    _dst.create( maxlevel + 1, 1, 0 );
    _dst.getMatRef(0) = src;

    int i=1;

#if IPP_VERSION_X100 >= 801 && 0
    CV_IPP_CHECK()
    {
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated))
        {
            typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownInitAlloc)(void** ppState, IppiSize srcRoi, Ipp32f rate, void* pKernel, int kerSize, int mode);
            typedef IppStatus (CV_STDCALL * ippiPyramidLayerDown)(void* pSrc, int srcStep, IppiSize srcRoiSize, void* pDst, int dstStep, IppiSize dstRoiSize, void* pState);
            typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownFree)(void* pState);

            int type = src.type();
            int depth = src.depth();
            ippiPyramidLayerDownInitAlloc pyrInitAllocFunc = 0;
            ippiPyramidLayerDown pyrDownFunc = 0;
            ippiPyramidLayerDownFree pyrFreeFunc = 0;

            if (type == CV_8UC1)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C1R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C1R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C1R;
            }
            else if (type == CV_8UC3)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C3R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C3R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C3R;
            }
            else if (type == CV_32FC1)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C1R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C1R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C1R;
            }
            else if (type == CV_32FC3)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C3R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C3R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C3R;
            }

            if (pyrInitAllocFunc && pyrDownFunc && pyrFreeFunc)
            {
                float rate = 2.f;
                IppiSize srcRoi = { src.cols, src.rows };
                IppiPyramid *gPyr;
                IppStatus ok = ippiPyramidInitAlloc(&gPyr, maxlevel + 1, srcRoi, rate);

                Ipp16s iKernel[5] = { 1, 4, 6, 4, 1 };
                Ipp32f fKernel[5] = { 1.f, 4.f, 6.f, 4.f, 1.f };
                void* kernel = depth >= CV_32F ? (void*) fKernel : (void*) iKernel;

                if (ok >= 0) ok = pyrInitAllocFunc((void**) &(gPyr->pState), srcRoi, rate, kernel, 5, IPPI_INTER_LINEAR);
                if (ok >= 0)
                {
                    gPyr->pImage[0] = src.data;
                    gPyr->pStep[0] = (int) src.step;
                    gPyr->pRoi[0] = srcRoi;
                    for( ; i <= maxlevel; i++ )
                    {
                        IppiSize dstRoi;
                        ok = ippiGetPyramidDownROI(gPyr->pRoi[i-1], &dstRoi, rate);
                        Mat& dst = _dst.getMatRef(i);
                        dst.create(Size(dstRoi.width, dstRoi.height), type);
                        gPyr->pImage[i] = dst.data;
                        gPyr->pStep[i] = (int) dst.step;
                        gPyr->pRoi[i] = dstRoi;

                        if (ok >= 0) ok = pyrDownFunc(gPyr->pImage[i-1], gPyr->pStep[i-1], gPyr->pRoi[i-1],
                                                      gPyr->pImage[i], gPyr->pStep[i], gPyr->pRoi[i], gPyr->pState);

                        if (ok < 0)
                        {
                            setIppErrorStatus();
                            break;
                        }
                        else
                        {
                            CV_IMPL_ADD(CV_IMPL_IPP);
                        }
                    }
                    pyrFreeFunc(gPyr->pState);
                }
                else
                    setIppErrorStatus();

                ippiPyramidFree(gPyr);
            }
        }
    }
#endif
    for( ; i <= maxlevel; i++ )
        pyrDown( _dst.getMatRef(i-1), _dst.getMatRef(i), Size(), borderType );
}

CV_IMPL void cvPyrDown( const void* srcarr, void* dstarr, int _filter )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
    cv::pyrDown( src, dst, dst.size() );
}

CV_IMPL void cvPyrUp( const void* srcarr, void* dstarr, int _filter )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
    cv::pyrUp( src, dst, dst.size() );
}


CV_IMPL void
cvReleasePyramid( CvMat*** _pyramid, int extra_layers )
{
    if( !_pyramid )
        CV_Error( CV_StsNullPtr, "" );

    if( *_pyramid )
        for( int i = 0; i <= extra_layers; i++ )
            cvReleaseMat( &(*_pyramid)[i] );

    cvFree( _pyramid );
}


CV_IMPL CvMat**
cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
                 const CvSize* layer_sizes, CvArr* bufarr,
                 int calc, int filter )
{
    const float eps = 0.1f;
    uchar* ptr = 0;

    CvMat stub, *src = cvGetMat( srcarr, &stub );

    if( extra_layers < 0 )
        CV_Error( CV_StsOutOfRange, "The number of extra layers must be non negative" );

    int i, layer_step, elem_size = CV_ELEM_SIZE(src->type);
    CvSize layer_size, size = cvGetMatSize(src);

    if( bufarr )
    {
        CvMat bstub, *buf;
        int bufsize = 0;

        buf = cvGetMat( bufarr, &bstub );
        bufsize = buf->rows*buf->cols*CV_ELEM_SIZE(buf->type);
        layer_size = size;
        for( i = 1; i <= extra_layers; i++ )
        {
            if( !layer_sizes )
            {
                layer_size.width = cvRound(layer_size.width*rate+eps);
                layer_size.height = cvRound(layer_size.height*rate+eps);
            }
            else
                layer_size = layer_sizes[i-1];
            layer_step = layer_size.width*elem_size;
            bufsize -= layer_step*layer_size.height;
        }

        if( bufsize < 0 )
            CV_Error( CV_StsOutOfRange, "The buffer is too small to fit the pyramid" );
        ptr = buf->data.ptr;
    }

    CvMat** pyramid = (CvMat**)cvAlloc( (extra_layers+1)*sizeof(pyramid[0]) );
    memset( pyramid, 0, (extra_layers+1)*sizeof(pyramid[0]) );

    pyramid[0] = cvCreateMatHeader( size.height, size.width, src->type );
    cvSetData( pyramid[0], src->data.ptr, src->step );
    layer_size = size;

    for( i = 1; i <= extra_layers; i++ )
    {
        if( !layer_sizes )
        {
            layer_size.width = cvRound(layer_size.width*rate + eps);
            layer_size.height = cvRound(layer_size.height*rate + eps);
        }
        else
            layer_size = layer_sizes[i];

        if( bufarr )
        {
            pyramid[i] = cvCreateMatHeader( layer_size.height, layer_size.width, src->type );
            layer_step = layer_size.width*elem_size;
            cvSetData( pyramid[i], ptr, layer_step );
            ptr += layer_step*layer_size.height;
        }
        else
            pyramid[i] = cvCreateMat( layer_size.height, layer_size.width, src->type );

        if( calc )
            cvPyrDown( pyramid[i-1], pyramid[i], filter );
            //cvResize( pyramid[i-1], pyramid[i], CV_INTER_LINEAR );
    }

    return pyramid;
}

/* End of file. */