/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include <float.h>
#include <stdio.h>
#include "lkpyramid.hpp"
#include "opencl_kernels_video.hpp"

// fixed-point descale with round-to-nearest: (x + 2^(n-1)) >> n
#define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))

namespace
{
// Computes x- and y-derivatives of the 8-bit src with the 3x3 Scharr operator
// (separable smoothing kernel [3 10 3] and difference kernel [-1 0 1]);
// dst receives the two derivatives interleaved per pixel.
static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
{
    using namespace cv;
    using cv::detail::deriv_type;
    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
    CV_Assert(depth == CV_8U);
    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::calcSharrDeriv(src, dst))
        return;
#endif

    int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
    AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
    deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);

#if CV_SSE2
    __m128i z = _mm_setzero_si128(), c3 = _mm_set1_epi16(3), c10 = _mm_set1_epi16(10);
#endif

#if CV_NEON
    const uint16x8_t q8 = vdupq_n_u16(3);
    const uint8x8_t d18 = vdup_n_u8(10);

    const int16x8_t q8i = vdupq_n_s16(3);
    const int16x8_t q9 = vdupq_n_s16(10);
#endif

    for( y = 0; y < rows; y++ )
    {
        const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
        const uchar* srow1 = src.ptr<uchar>(y);
        const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        deriv_type* drow = dst.ptr<deriv_type>(y);

        // do vertical convolution
        x = 0;
#if CV_SSE2
        for( ; x <= colsn - 8; x += 8 )
        {
            __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z);
            __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z);
            __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z);
            __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s0, s2), c3), _mm_mullo_epi16(s1, c10));
            __m128i t1 = _mm_sub_epi16(s2, s0);
            _mm_store_si128((__m128i*)(trow0 + x), t0);
            _mm_store_si128((__m128i*)(trow1 + x), t1);
        }
#endif

#if CV_NEON
        for( ; x <= colsn - 8; x += 8)
        {
            uint8x8_t d0 = vld1_u8((const uint8_t*)&srow0[x]);
            uint8x8_t d1 = vld1_u8((const uint8_t*)&srow1[x]);
            uint8x8_t d2 = vld1_u8((const uint8_t*)&srow2[x]);
            uint16x8_t q4 = vaddl_u8(d0, d2);
            uint16x8_t q11 = vsubl_u8(d2, d0);
            uint16x8_t q5 = vmulq_u16(q4, q8);
            uint16x8_t q6 = vmull_u8(d1, d18);
            uint16x8_t q10 = vaddq_u16(q6, q5);
            vst1q_u16((uint16_t*)&trow0[x], q10);
            vst1q_u16((uint16_t*)&trow1[x], q11);
        }
#endif

        for( ; x < colsn; x++ )
        {
            int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
            int t1 = srow2[x] - srow0[x];
            trow0[x] = (deriv_type)t0;
            trow1[x] = (deriv_type)t1;
        }

        // make border
        int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
        for( int k = 0; k < cn; k++ )
        {
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
        }

        // do horizontal convolution, interleave the results and store them to dst
        x = 0;
#if CV_SSE2
        for( ; x <= colsn - 8; x += 8 )
        {
            __m128i s0 = _mm_loadu_si128((const __m128i*)(trow0 + x - cn));
            __m128i s1 = _mm_loadu_si128((const __m128i*)(trow0 + x + cn));
            __m128i s2 = _mm_loadu_si128((const __m128i*)(trow1 + x - cn));
            __m128i s3 = _mm_load_si128((const __m128i*)(trow1 + x));
            __m128i s4 = _mm_loadu_si128((const __m128i*)(trow1 + x + cn));

            __m128i t0 = _mm_sub_epi16(s1, s0);
            __m128i t1 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s2, s4), c3), _mm_mullo_epi16(s3, c10));
            __m128i t2 = _mm_unpacklo_epi16(t0, t1);
            t0 = _mm_unpackhi_epi16(t0, t1);
            // this can probably be replaced with aligned stores if we aligned dst properly.
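            // t2 and t0 now hold interleaved (Ix, Iy) pairs for 8 consecutive
            // pixels, matching the 2-channel derivative layout of dst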
            _mm_storeu_si128((__m128i*)(drow + x*2), t2);
            _mm_storeu_si128((__m128i*)(drow + x*2 + 8), t0);
        }
#endif

#if CV_NEON
        for( ; x <= colsn - 8; x += 8 )
        {
            int16x8_t q0 = vld1q_s16((const int16_t*)&trow0[x+cn]);
            int16x8_t q1 = vld1q_s16((const int16_t*)&trow0[x-cn]);
            int16x8_t q2 = vld1q_s16((const int16_t*)&trow1[x+cn]);
            int16x8_t q3 = vld1q_s16((const int16_t*)&trow1[x-cn]);
            int16x8_t q5 = vsubq_s16(q0, q1);
            int16x8_t q6 = vaddq_s16(q2, q3);
            int16x8_t q4 = vld1q_s16((const int16_t*)&trow1[x]);
            int16x8_t q7 = vmulq_s16(q6, q8i);
            int16x8_t q10 = vmulq_s16(q4, q9);
            int16x8_t q11 = vaddq_s16(q7, q10);
            int16x4_t d22 = vget_low_s16(q11);
            int16x4_t d23 = vget_high_s16(q11);
            int16x4_t d11 = vget_high_s16(q5);
            int16x4_t d10 = vget_low_s16(q5);
            int16x4x2_t q5x2, q11x2;
            q5x2.val[0] = d10; q5x2.val[1] = d22;
            q11x2.val[0] = d11; q11x2.val[1] = d23;
            vst2_s16((int16_t*)&drow[x*2], q5x2);
            vst2_s16((int16_t*)&drow[(x*2)+8], q11x2);
        }
#endif
        for( ; x < colsn; x++ )
        {
            deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
            deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
            drow[x*2] = t0; drow[x*2+1] = t1;
        }
    }
}

}//namespace

cv::detail::LKTrackerInvoker::LKTrackerInvoker(
                      const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                      const Point2f* _prevPts, Point2f* _nextPts,
                      uchar* _status, float* _err,
                      Size _winSize, TermCriteria _criteria,
                      int _level, int _maxLevel, int _flags, float _minEigThreshold )
{
    prevImg = &_prevImg;
    prevDeriv = &_prevDeriv;
    nextImg = &_nextImg;
    prevPts = _prevPts;
    nextPts = _nextPts;
    status = _status;
    err = _err;
    winSize = _winSize;
    criteria = _criteria;
    level = _level;
    maxLevel = _maxLevel;
    flags = _flags;
    minEigThreshold = _minEigThreshold;
}

#if defined __arm__ && !CV_NEON
typedef int64 acctype;
typedef int itemtype;
#else
typedef float acctype;
typedef float itemtype;
#endif

void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
{
    Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
    const Mat& I = *prevImg;
    const Mat& J = *nextImg;
    const Mat& derivI = *prevDeriv;

    int j, cn = I.channels(), cn2 = cn*2;
    cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
    int derivDepth = DataType<deriv_type>::depth;

    Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
    Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);

    for( int ptidx = range.start; ptidx < range.end; ptidx++ )
    {
        Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
        Point2f nextPt;
        if( level == maxLevel )
        {
            if( flags & OPTFLOW_USE_INITIAL_FLOW )
                nextPt = nextPts[ptidx]*(float)(1./(1 << level));
            else
                nextPt = prevPt;
        }
        else
            nextPt = nextPts[ptidx]*2.f;
        nextPts[ptidx] = nextPt;

        Point2i iprevPt, inextPt;
        prevPt -= halfWin;
        iprevPt.x = cvFloor(prevPt.x);
        iprevPt.y = cvFloor(prevPt.y);

        if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
            iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }

        float a = prevPt.x - iprevPt.x;
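        // (a, b) is the sub-pixel offset of the window origin; iw00..iw11 below
        // are the four bilinear interpolation weights in W_BITS fixed point,
        // defined so that they sum exactly to 1 << W_BITS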
        float b = prevPt.y - iprevPt.y;
        const int W_BITS = 14, W_BITS1 = 14;
        const float FLT_SCALE = 1.f/(1 << 20);
        int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
        int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
        int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
        int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

        int dstep = (int)(derivI.step/derivI.elemSize1());
        int stepI = (int)(I.step/I.elemSize1());
        int stepJ = (int)(J.step/J.elemSize1());
        acctype iA11 = 0, iA12 = 0, iA22 = 0;
        float A11, A12, A22;

#if CV_SSE2
        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
        __m128i z = _mm_setzero_si128();
        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
#endif

#if CV_NEON
        int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
        const int shifter1 = -(W_BITS - 5); //negative so it shifts right
        const int shifter2 = -(W_BITS);

        const int16x4_t d26 = vdup_n_s16((int16_t)iw00);
        const int16x4_t d27 = vdup_n_s16((int16_t)iw01);
        const int16x4_t d28 = vdup_n_s16((int16_t)iw10);
        const int16x4_t d29 = vdup_n_s16((int16_t)iw11);
        const int32x4_t q11 = vdupq_n_s32((int32_t)shifter1);
        const int32x4_t q12 = vdupq_n_s32((int32_t)shifter2);
#endif

        // extract the patch from the first image, compute covariance matrix of derivatives
        int x, y;
        for( y = 0; y < winSize.height; y++ )
        {
            const uchar* src = I.ptr() + (y + iprevPt.y)*stepI + iprevPt.x*cn;
            const deriv_type* dsrc = derivI.ptr<deriv_type>() + (y + iprevPt.y)*dstep + iprevPt.x*cn2;

            deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
            deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);

            x = 0;

#if CV_SSE2
            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
                __m128i v00, v01, v10, v11, t0, t1;

                v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
                v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
                v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
                v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);

                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
                _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));

                v00 = _mm_loadu_si128((const __m128i*)(dsrc));
                v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
                v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
                v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));

                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
                v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
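                // below, the interleaved (Ix, Iy) derivatives are written to the
                // window buffer and the structure-tensor sums A11 = sum(Ix*Ix),
                // A12 = sum(Ix*Iy), A22 = sum(Iy*Iy) are accumulated in
                // qA11/qA12/qA22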
                _mm_storeu_si128((__m128i*)dIptr, v00);
                t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
                t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3

                __m128 fy = _mm_cvtepi32_ps(t0);
                __m128 fx = _mm_cvtepi32_ps(t1);

                qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
                qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
                qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
            }
#endif

#if CV_NEON
            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
                uint8x8_t d0 = vld1_u8(&src[x]);
                uint8x8_t d2 = vld1_u8(&src[x+cn]);
                uint16x8_t q0 = vmovl_u8(d0);
                uint16x8_t q1 = vmovl_u8(d2);

                int32x4_t q5 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26);
                int32x4_t q6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27);

                uint8x8_t d4 = vld1_u8(&src[x + stepI]);
                uint8x8_t d6 = vld1_u8(&src[x + stepI + cn]);
                uint16x8_t q2 = vmovl_u8(d4);
                uint16x8_t q3 = vmovl_u8(d6);

                int32x4_t q7 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28);
                int32x4_t q8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29);

                q5 = vaddq_s32(q5, q6);
                q7 = vaddq_s32(q7, q8);
                q5 = vaddq_s32(q5, q7);

                int16x4x2_t d0d1 = vld2_s16(dsrc);
                int16x4x2_t d2d3 = vld2_s16(&dsrc[cn2]);

                q5 = vqrshlq_s32(q5, q11);

                int32x4_t q4 = vmull_s16(d0d1.val[0], d26);
                q6 = vmull_s16(d0d1.val[1], d26);

                int16x4_t nd0 = vmovn_s32(q5);

                q7 = vmull_s16(d2d3.val[0], d27);
                q8 = vmull_s16(d2d3.val[1], d27);

                vst1_s16(&Iptr[x], nd0);

                int16x4x2_t d4d5 = vld2_s16(&dsrc[dstep]);
                int16x4x2_t d6d7 = vld2_s16(&dsrc[dstep+cn2]);

                q4 = vaddq_s32(q4, q7);
                q6 = vaddq_s32(q6, q8);

                q7 = vmull_s16(d4d5.val[0], d28);
                int32x4_t nq0 = vmull_s16(d4d5.val[1], d28);
                q8 = vmull_s16(d6d7.val[0], d29);
                int32x4_t q15 = vmull_s16(d6d7.val[1], d29);

                q7 = vaddq_s32(q7, q8);
                nq0 = vaddq_s32(nq0, q15);

                q4 = vaddq_s32(q4, q7);
                q6 = vaddq_s32(q6, nq0);

                int32x4_t nq1 = vld1q_s32(nA12);
                int32x4_t nq2 = vld1q_s32(nA22);
                nq0 = vld1q_s32(nA11);

                q4 = vqrshlq_s32(q4, q12);
                q6 = vqrshlq_s32(q6, q12);

                q7 = vmulq_s32(q4, q4);
                q8 = vmulq_s32(q4, q6);
                q15 = vmulq_s32(q6, q6);

                nq0 = vaddq_s32(nq0, q7);
                nq1 = vaddq_s32(nq1, q8);
                nq2 = vaddq_s32(nq2, q15);

                vst1q_s32(nA11, nq0);
                vst1q_s32(nA12, nq1);
                vst1q_s32(nA22, nq2);

                int16x4_t d8 = vmovn_s32(q4);
                int16x4_t d12 = vmovn_s32(q6);

                int16x4x2_t d8d12;
                d8d12.val[0] = d8; d8d12.val[1] = d12;
                vst2_s16(dIptr, d8d12);
            }
#endif

            for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
            {
                int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);
                int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);

                Iptr[x] = (short)ival;
                dIptr[0] = (short)ixval;
                dIptr[1] = (short)iyval;

                iA11 += (itemtype)(ixval*ixval);
                iA12 += (itemtype)(ixval*iyval);
                iA22 += (itemtype)(iyval*iyval);
            }
        }

#if CV_SSE2
        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
        _mm_store_ps(A11buf, qA11);
        _mm_store_ps(A12buf, qA12);
        _mm_store_ps(A22buf, qA22);
        iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
        iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
        iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif

#if CV_NEON
        iA11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
        iA12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
        iA22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);
#endif

        A11 = iA11*FLT_SCALE;
        A12 = iA12*FLT_SCALE;
        A22 = iA22*FLT_SCALE;

        float D = A11*A22 - A12*A12;
        // minEig is the smaller eigenvalue of the 2x2 gradient covariance matrix,
        // normalized by the number of pixels in the window
        float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
                        4.f*A12*A12))/(2*winSize.width*winSize.height);

        if( err && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) != 0 )
            err[ptidx] = (float)minEig;

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }

        D = 1.f/D;

        nextPt -= halfWin;
        Point2f prevDelta;

        for( j = 0; j < criteria.maxCount; j++ )
        {
            inextPt.x = cvFloor(nextPt.x);
            inextPt.y = cvFloor(nextPt.y);

            if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
                inextPt.y < -winSize.height || inextPt.y >= J.rows )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }

            a = nextPt.x - inextPt.x;
            b = nextPt.y - inextPt.y;
            iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
            iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            acctype ib1 = 0, ib2 = 0;
            float b1, b2;
#if CV_SSE2
            qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
            qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
            __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
#endif

#if CV_NEON
            int CV_DECL_ALIGNED(16) nB1[] = {0,0,0,0}, nB2[] = {0,0,0,0};

            const int16x4_t d26_2 = vdup_n_s16((int16_t)iw00);
            const int16x4_t d27_2 = vdup_n_s16((int16_t)iw01);
            const int16x4_t d28_2 = vdup_n_s16((int16_t)iw10);
            const int16x4_t d29_2 = vdup_n_s16((int16_t)iw11);
#endif

            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = J.ptr() + (y + inextPt.y)*stepJ + inextPt.x*cn;
                const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
                const deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);

                x = 0;

#if CV_SSE2
                for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                {
                    __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
                    __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
                    __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
                    __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
                    __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);

                    __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                    __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
                                               _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
                    t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
                    t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
                    diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
                    diff1 = _mm_unpackhi_epi16(diff0, diff0);
                    diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
                    v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
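                    // v00/v01 hold interleaved (Ix, Iy); multiplied by the
                    // duplicated It values in diff0/diff1 they accumulate into
                    // qb0/qb1, which reduce to b1 = sum(It*Ix) and
                    // b2 = sum(It*Iy) after the loop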
                    v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
                    v10 = _mm_mullo_epi16(v00, diff0);
                    v11 = _mm_mulhi_epi16(v00, diff0);
                    v00 = _mm_unpacklo_epi16(v10, v11);
                    v10 = _mm_unpackhi_epi16(v10, v11);
                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
                    v10 = _mm_mullo_epi16(v01, diff1);
                    v11 = _mm_mulhi_epi16(v01, diff1);
                    v00 = _mm_unpacklo_epi16(v10, v11);
                    v10 = _mm_unpackhi_epi16(v10, v11);
                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
                }
#endif

#if CV_NEON
                for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                {
                    uint8x8_t d0 = vld1_u8(&Jptr[x]);
                    uint8x8_t d2 = vld1_u8(&Jptr[x+cn]);
                    uint8x8_t d4 = vld1_u8(&Jptr[x+stepJ]);
                    uint8x8_t d6 = vld1_u8(&Jptr[x+stepJ+cn]);

                    uint16x8_t q0 = vmovl_u8(d0);
                    uint16x8_t q1 = vmovl_u8(d2);
                    uint16x8_t q2 = vmovl_u8(d4);
                    uint16x8_t q3 = vmovl_u8(d6);

                    int32x4_t nq4 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26_2);
                    int32x4_t nq5 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q0)), d26_2);

                    int32x4_t nq6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27_2);
                    int32x4_t nq7 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q1)), d27_2);

                    int32x4_t nq8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28_2);
                    int32x4_t nq9 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q2)), d28_2);

                    int32x4_t nq10 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29_2);
                    int32x4_t nq11 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q3)), d29_2);

                    nq4 = vaddq_s32(nq4, nq6);
                    nq5 = vaddq_s32(nq5, nq7);
                    nq8 = vaddq_s32(nq8, nq10);
                    nq9 = vaddq_s32(nq9, nq11);

                    int16x8_t q6 = vld1q_s16(&Iptr[x]);

                    nq4 = vaddq_s32(nq4, nq8);
                    nq5 = vaddq_s32(nq5, nq9);

                    nq8 = vmovl_s16(vget_high_s16(q6));
                    nq6 = vmovl_s16(vget_low_s16(q6));

                    nq4 = vqrshlq_s32(nq4, q11);
                    nq5 = vqrshlq_s32(nq5, q11);

                    int16x8x2_t q0q1 = vld2q_s16(dIptr);
                    nq11 = vld1q_s32(nB1);
                    int32x4_t nq15 = vld1q_s32(nB2);

                    nq4 = vsubq_s32(nq4, nq6);
                    nq5 = vsubq_s32(nq5, nq8);

                    int32x4_t nq2 = vmovl_s16(vget_low_s16(q0q1.val[0]));
                    int32x4_t nq3 = vmovl_s16(vget_high_s16(q0q1.val[0]));

                    nq7 = vmovl_s16(vget_low_s16(q0q1.val[1]));
                    nq8 = vmovl_s16(vget_high_s16(q0q1.val[1]));

                    nq9 = vmulq_s32(nq4, nq2);
                    nq10 = vmulq_s32(nq5, nq3);

                    nq4 = vmulq_s32(nq4, nq7);
                    nq5 = vmulq_s32(nq5, nq8);

                    nq9 = vaddq_s32(nq9, nq10);
                    nq4 = vaddq_s32(nq4, nq5);

                    nq11 = vaddq_s32(nq11, nq9);
                    nq15 = vaddq_s32(nq15, nq4);

                    vst1q_s32(nB1, nq11);
                    vst1q_s32(nB2, nq15);
                }
#endif

                for( ; x < winSize.width*cn; x++, dIptr += 2 )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    ib1 += (itemtype)(diff*dIptr[0]);
                    ib2 += (itemtype)(diff*dIptr[1]);
                }
            }

#if CV_SSE2
            float CV_DECL_ALIGNED(16) bbuf[4];
            _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
            ib1 += bbuf[0] + bbuf[2];
            ib2 += bbuf[1] + bbuf[3];
#endif

#if CV_NEON
            ib1 += (float)(nB1[0] + nB1[1] + nB1[2] + nB1[3]);
            ib2 += (float)(nB2[0] + nB2[1] + nB2[2] + nB2[3]);
#endif

            b1 = ib1*FLT_SCALE;
            b2 = ib2*FLT_SCALE;

            Point2f delta( (float)((A12*b2 - A22*b1) * D),
                           (float)((A12*b1 - A11*b2) * D));
            //delta = -delta;

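            // delta solves the 2x2 normal equations G * delta = -b for
            // G = [A11 A12; A12 A22]; the explicit inverse is used, with D
            // holding 1/det(G) at this point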
            nextPt += delta;
            nextPts[ptidx] = nextPt + halfWin;

            if( delta.ddot(delta) <= criteria.epsilon )
                break;

            if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
                std::abs(delta.y + prevDelta.y) < 0.01 )
            {
                nextPts[ptidx] -= delta*0.5f;
                break;
            }
            prevDelta = delta;
        }

        if( status[ptidx] && err && level == 0 && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) == 0 )
        {
            Point2f nextPoint = nextPts[ptidx] - halfWin;
            Point inextPoint;

            inextPoint.x = cvFloor(nextPoint.x);
            inextPoint.y = cvFloor(nextPoint.y);

            if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
                inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }

            float aa = nextPoint.x - inextPoint.x;
            float bb = nextPoint.y - inextPoint.y;
            iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
            iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
            iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            float errval = 0.f;

            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = J.ptr() + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
                const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);

                for( x = 0; x < winSize.width*cn; x++ )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((float)diff);
                }
            }
            // the window values carry 5 extra fractional bits, so diff is in
            // 1/32 gray-level units; normalize by 32 and by the window size
            err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
        }
    }
}

int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives,
                                int pyrBorder, int derivBorder, bool tryReuseInputImage)
{
    Mat img = _img.getMat();
    CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2 );
    int pyrstep = withDerivatives ? 2 : 1;

    pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0);

    int derivType = CV_MAKETYPE(DataType<cv::detail::deriv_type>::depth, img.channels() * 2);

    //level 0
    bool lvl0IsSet = false;
    if(tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0)
    {
        Size wholeSize;
        Point ofs;
        img.locateROI(wholeSize, ofs);
        if (ofs.x >= winSize.width && ofs.y >= winSize.height
              && ofs.x + img.cols + winSize.width <= wholeSize.width
              && ofs.y + img.rows + winSize.height <= wholeSize.height)
        {
            pyramid.getMatRef(0) = img;
            lvl0IsSet = true;
        }
    }

    if(!lvl0IsSet)
    {
        Mat& temp = pyramid.getMatRef(0);

        if(!temp.empty())
            temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
        if(temp.type() != img.type() || temp.cols != winSize.width*2 + img.cols || temp.rows != winSize.height * 2 + img.rows)
            temp.create(img.rows + winSize.height*2, img.cols + winSize.width*2, img.type());

        if(pyrBorder == BORDER_TRANSPARENT)
            img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows)));
        else
            copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder);
        temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
    }

    Size sz = img.size();
    Mat prevLevel = pyramid.getMatRef(0);
    Mat thisLevel = prevLevel;

    for(int level = 0; level <= maxLevel; ++level)
    {
        if (level != 0)
        {
            Mat& temp = pyramid.getMatRef(level * pyrstep);

            if(!temp.empty())
                temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
            if(temp.type() != img.type() || temp.cols != winSize.width*2 + sz.width || temp.rows != winSize.height * 2 + sz.height)
                temp.create(sz.height + winSize.height*2, sz.width + winSize.width*2, img.type());

            thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height));
            pyrDown(prevLevel, thisLevel, sz);

            if(pyrBorder != BORDER_TRANSPARENT)
                copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder|BORDER_ISOLATED);
            temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
        }

        if(withDerivatives)
        {
            Mat& deriv = pyramid.getMatRef(level * pyrstep + 1);

            if(!deriv.empty())
                deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
            if(deriv.type() != derivType || deriv.cols != winSize.width*2 + sz.width || deriv.rows != winSize.height * 2 + sz.height)
                deriv.create(sz.height + winSize.height*2, sz.width + winSize.width*2, derivType);

            Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height));
            calcSharrDeriv(thisLevel, derivI);

            if(derivBorder != BORDER_TRANSPARENT)
                copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder|BORDER_ISOLATED);
            deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
        }

        sz = Size((sz.width+1)/2, (sz.height+1)/2);
        if( sz.width <= winSize.width || sz.height <= winSize.height )
        {
            pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true, 0);//check this
            return level;
        }

        prevLevel = thisLevel;
    }

    return maxLevel;
}

namespace cv
{
class PyrLKOpticalFlow
{
    struct dim3
    {
        unsigned int x, y, z;
        dim3() : x(0), y(0), z(0) { }
    };
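    // minimal stand-in for the CUDA-style dim3 triple; it only carries the
    // block/patch dimensions used to configure the OpenCL kernel launch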
public:
    PyrLKOpticalFlow()
    {
        winSize = Size(21, 21);
        maxLevel = 3;
        iters = 30;
        derivLambda = 0.5;
        useInitialFlow = false;

        waveSize = 0;
    }

    bool checkParam()
    {
        iters = std::min(std::max(iters, 0), 100);

        derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
        if (derivLambda < 0)
            return false;
        if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
            return false;
        calcPatchSize();
        if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
            return false;
        if (!initWaveSize())
            return false;
        return true;
    }

    bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
    {
        if (!checkParam())
            return false;

        UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
        UMat temp2 = nextPts.reshape(1);
        multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2);

        status.setTo(Scalar::all(1));

        // build the image pyramids.
        std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
        std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);

        // allocate buffers with aligned pitch to be able to use the cl_khr_image2d_from_buffer extension
        // This is the required pitch alignment in pixels
        int pitchAlign = (int)ocl::Device::getDefault().imagePitchAlignment();
        if (pitchAlign>0)
        {
            prevPyr[0] = UMat(prevImg.rows,(prevImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,prevImg.cols);
            nextPyr[0] = UMat(nextImg.rows,(nextImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,nextImg.cols);
            for (int level = 1; level <= maxLevel; ++level)
            {
                int cols,rows;
                // allocate buffers with aligned pitch to be able to use the image-on-buffer extension
                cols = (prevPyr[level - 1].cols+1)/2;
                rows = (prevPyr[level - 1].rows+1)/2;
                prevPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),prevPyr[level-1].type()).colRange(0,cols);
                cols = (nextPyr[level - 1].cols+1)/2;
                rows = (nextPyr[level - 1].rows+1)/2;
                nextPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),nextPyr[level-1].type()).colRange(0,cols);
            }
        }

        prevImg.convertTo(prevPyr[0], CV_32F);
        nextImg.convertTo(nextPyr[0], CV_32F);

        for (int level = 1; level <= maxLevel; ++level)
        {
            pyrDown(prevPyr[level - 1], prevPyr[level]);
            pyrDown(nextPyr[level - 1], nextPyr[level]);
        }

        // dI/dx ~ Ix, dI/dy ~ Iy
        for (int level = maxLevel; level >= 0; level--)
        {
            if (!lkSparse_run(prevPyr[level], nextPyr[level], prevPts,
                              nextPts, status, err,
                              prevPts.cols, level))
                return false;
        }
        return true;
    }

    Size winSize;
    int maxLevel;
    int iters;
    double derivLambda;
    bool useInitialFlow;

private:
    int waveSize;
    bool initWaveSize()
    {
        waveSize = 1;
        if (isDeviceCPU())
            return true;

        ocl::Kernel kernel;
        if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
            return false;
        waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
        return true;
    }
    dim3 patch;
    void calcPatchSize()
    {
        dim3 block;

        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
            block.x = 32;
            block.y = 8;
        }
        else
        {
            block.x = 16;
            block.y = 16;
        }

        patch.x = (winSize.width + block.x - 1) / block.x;
        patch.y = (winSize.height + block.y - 1) / block.y;

        block.z = patch.z = 1;
    }

    bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
                      int ptcount, int level)
    {
        size_t localThreads[3] = { 8, 8 };
        size_t globalThreads[3] = { 8 * ptcount, 8 };
        char calcErr = (0 == level) ? 1 : 0;

        cv::String build_options;
        if (isDeviceCPU())
            build_options = " -D CPU";
        else
            build_options = cv::format("-D WAVE_SIZE=%d", waveSize);

        ocl::Kernel kernel;
        if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
            return false;

        CV_Assert(I.depth() == CV_32F && J.depth() == CV_32F);
        ocl::Image2D imageI(I, false, ocl::Image2D::canCreateAlias(I));
        ocl::Image2D imageJ(J, false, ocl::Image2D::canCreateAlias(J));

        int idxArg = 0;
        idxArg = kernel.set(idxArg, imageI); // image2d_t I
        idxArg = kernel.set(idxArg, imageJ); // image2d_t J
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global float2* nextPts
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
        idxArg = kernel.set(idxArg, (int)level); // const int level
        idxArg = kernel.set(idxArg, (int)I.rows); // const int rows
        idxArg = kernel.set(idxArg, (int)I.cols); // const int cols
        idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X
        idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y
        idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x
        idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
        idxArg = kernel.set(idxArg, (int)iters); // int c_iters
        idxArg = kernel.set(idxArg, (char)calcErr); // char calcErr
        return kernel.run(2, globalThreads, localThreads, false);
    }
private:
    inline static bool isDeviceCPU()
    {
        return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
    }
};


static bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
                                     InputArray _prevPts, InputOutputArray _nextPts,
                                     OutputArray _status, OutputArray _err,
                                     Size winSize, int maxLevel,
                                     TermCriteria criteria,
                                     int flags/*, double minEigThreshold*/ )
{
    if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
        return false;
    if (!cv::ocl::Device::getDefault().imageSupport())
        return false;
    if (_nextImg.size() != _prevImg.size())
        return false;
    int typePrev = _prevImg.type();
    int typeNext = _nextImg.type();
    if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
        return false;
    if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
        return false;

    if (_prevPts.empty() || _prevPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
        return false;
    if ((1 != _prevPts.size().height) && (1 != _prevPts.size().width))
        return false;
    size_t npoints = _prevPts.total();
    bool useInitialFlow = (0 != (flags & OPTFLOW_USE_INITIAL_FLOW));
    if (useInitialFlow)
    {
        if (_nextPts.empty() || _nextPts.type() != CV_32FC2 || (!_nextPts.isContinuous()))
            return false;
        if ((1 != _nextPts.size().height) && (1 != _nextPts.size().width))
            return false;
        if (_nextPts.total() != npoints)
            return false;
    }
    else
    {
        _nextPts.create(_prevPts.size(), _prevPts.type());
    }
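    // all inputs are validated; configure and run the OpenCL tracker. Any
    // false return makes the caller fall back to the CPU implementation.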
    PyrLKOpticalFlow opticalFlow;
    opticalFlow.winSize = winSize;
    opticalFlow.maxLevel = maxLevel;
    opticalFlow.iters = criteria.maxCount;
    opticalFlow.derivLambda = criteria.epsilon;
    opticalFlow.useInitialFlow = useInitialFlow;

    if (!opticalFlow.checkParam())
        return false;

    UMat umatErr;
    if (_err.needed())
    {
        _err.create((int)npoints, 1, CV_32FC1);
        umatErr = _err.getUMat();
    }
    else
        umatErr.create((int)npoints, 1, CV_32FC1);

    _status.create((int)npoints, 1, CV_8UC1);
    UMat umatNextPts = _nextPts.getUMat();
    UMat umatStatus = _status.getUMat();
    return opticalFlow.sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
}
}

void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                               InputArray _prevPts, InputOutputArray _nextPts,
                               OutputArray _status, OutputArray _err,
                               Size winSize, int maxLevel,
                               TermCriteria criteria,
                               int flags, double minEigThreshold )
{
    bool use_opencl = ocl::useOpenCL() &&
                      (_prevImg.isUMat() || _nextImg.isUMat()) &&
                      ocl::Image2D::isFormatSupported(CV_32F, 1, false);
    if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/))
    {
        CV_IMPL_ADD(CV_IMPL_OCL);
        return;
    }

    Mat prevPtsMat = _prevPts.getMat();
    const int derivDepth = DataType<cv::detail::deriv_type>::depth;

    CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );

    int level=0, i, npoints;
    CV_Assert( (npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0 );

    if( npoints == 0 )
    {
        _nextPts.release();
        _status.release();
        _err.release();
        return;
    }

    if( !(flags & OPTFLOW_USE_INITIAL_FLOW) )
        _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);

    Mat nextPtsMat = _nextPts.getMat();
    CV_Assert( nextPtsMat.checkVector(2, CV_32F, true) == npoints );

    const Point2f* prevPts = prevPtsMat.ptr<Point2f>();
    Point2f* nextPts = nextPtsMat.ptr<Point2f>();

    _status.create((int)npoints, 1, CV_8U, -1, true);
    Mat statusMat = _status.getMat(), errMat;
    CV_Assert( statusMat.isContinuous() );
    uchar* status = statusMat.ptr();
    float* err = 0;

    for( i = 0; i < npoints; i++ )
        status[i] = true;

    if( _err.needed() )
    {
        _err.create((int)npoints, 1, CV_32F, -1, true);
        errMat = _err.getMat();
        CV_Assert( errMat.isContinuous() );
        err = errMat.ptr<float>();
    }

    std::vector<Mat> prevPyr, nextPyr;
    int levels1 = -1;
    int lvlStep1 = 1;
    int levels2 = -1;
    int lvlStep2 = 1;

    if(_prevImg.kind() == _InputArray::STD_VECTOR_MAT)
    {
        _prevImg.getMatVector(prevPyr);

        levels1 = int(prevPyr.size()) - 1;
        CV_Assert(levels1 >= 0);

        if (levels1 % 2 == 1 && prevPyr[0].channels() * 2 == prevPyr[1].channels() && prevPyr[1].depth() == derivDepth)
        {
            lvlStep1 = 2;
            levels1 /= 2;
        }

        // ensure that pyramid has required padding
        if(levels1 > 0)
        {
            Size fullSize;
            Point ofs;
            prevPyr[lvlStep1].locateROI(fullSize, ofs);
            CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
                && ofs.x + prevPyr[lvlStep1].cols + winSize.width <= fullSize.width
                && ofs.y + prevPyr[lvlStep1].rows + winSize.height <= fullSize.height);
        }

        if(levels1 < maxLevel)
            maxLevel = levels1;
    }

    if(_nextImg.kind() == _InputArray::STD_VECTOR_MAT)
    {
        _nextImg.getMatVector(nextPyr);

        levels2 = int(nextPyr.size()) - 1;
        CV_Assert(levels2 >= 0);

        if (levels2 % 2 == 1 && nextPyr[0].channels() * 2 == nextPyr[1].channels() && nextPyr[1].depth() == derivDepth)
        {
            lvlStep2 = 2;
            levels2 /= 2;
        }

        // ensure that pyramid has required padding
        if(levels2 > 0)
        {
            Size fullSize;
            Point ofs;
            nextPyr[lvlStep2].locateROI(fullSize, ofs);
            CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
                && ofs.x + nextPyr[lvlStep2].cols + winSize.width <= fullSize.width
                && ofs.y + nextPyr[lvlStep2].rows + winSize.height <= fullSize.height);
        }

        if(levels2 < maxLevel)
            maxLevel = levels2;
    }

    if (levels1 < 0)
        maxLevel = buildOpticalFlowPyramid(_prevImg, prevPyr, winSize, maxLevel, false);

    if (levels2 < 0)
        maxLevel = buildOpticalFlowPyramid(_nextImg, nextPyr, winSize, maxLevel, false);

    if( (criteria.type & TermCriteria::COUNT) == 0 )
        criteria.maxCount = 30;
    else
        criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100);
    if( (criteria.type & TermCriteria::EPS) == 0 )
        criteria.epsilon = 0.01;
    else
        criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.);
    criteria.epsilon *= criteria.epsilon;

    // dI/dx ~ Ix, dI/dy ~ Iy
    Mat derivIBuf;
    if(lvlStep1 == 1)
        derivIBuf.create(prevPyr[0].rows + winSize.height*2, prevPyr[0].cols + winSize.width*2, CV_MAKETYPE(derivDepth, prevPyr[0].channels() * 2));

    for( level = maxLevel; level >= 0; level-- )
    {
        Mat derivI;
        if(lvlStep1 == 1)
        {
            Size imgSize = prevPyr[level * lvlStep1].size();
            Mat _derivI( imgSize.height + winSize.height*2,
                imgSize.width + winSize.width*2, derivIBuf.type(), derivIBuf.ptr() );
            derivI = _derivI(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));
            calcSharrDeriv(prevPyr[level * lvlStep1], derivI);
            copyMakeBorder(derivI, _derivI, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_CONSTANT|BORDER_ISOLATED);
        }
        else
            derivI = prevPyr[level * lvlStep1 + 1];

        CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
        CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());

#ifdef HAVE_TEGRA_OPTIMIZATION
        typedef tegra::LKTrackerInvoker<cv::detail::LKTrackerInvoker> LKTrackerInvoker;
#else
        typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
#endif

        parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
                                                          nextPyr[level * lvlStep2], prevPts, nextPts,
                                                          status, err,
                                                          winSize, criteria, level, maxLevel,
                                                          flags, (float)minEigThreshold));
    }
}

namespace cv
{

// Least-squares fit of a 2x3 transform M mapping points a[] to b[]: either a
// full 6-dof affine transform, or a 4-dof similarity (scaled rotation plus
// translation). The normal equations are accumulated in sa/sb and solved
// with eigenvalue decomposition.
static void
getRTMatrix( const Point2f* a, const Point2f* b,
             int count, Mat& M, bool fullAffine )
{
    CV_Assert( M.isContinuous() );

    if( fullAffine )
    {
        double sa[6][6]={{0.}}, sb[6]={0.};
        Mat A( 6, 6, CV_64F, &sa[0][0] ), B( 6, 1, CV_64F, sb );
        Mat MM = M.reshape(1, 6);

        for( int i = 0; i < count; i++ )
        {
            sa[0][0] += a[i].x*a[i].x;
            sa[0][1] += a[i].y*a[i].x;
            sa[0][2] += a[i].x;

            sa[1][1] += a[i].y*a[i].y;
            sa[1][2] += a[i].y;

            sa[2][2] += 1;

            sb[0] += a[i].x*b[i].x;
            sb[1] += a[i].y*b[i].x;
            sb[2] += b[i].x;
            sb[3] += a[i].x*b[i].y;
            sb[4] += a[i].y*b[i].y;
            sb[5] += b[i].y;
        }

        sa[3][4] = sa[4][3] = sa[1][0] = sa[0][1];
        sa[3][5] = sa[5][3] = sa[2][0] = sa[0][2];
        sa[4][5] = sa[5][4] = sa[2][1] = sa[1][2];

        sa[3][3] = sa[0][0];
        sa[4][4] = sa[1][1];
        sa[5][5] = sa[2][2];

        solve( A, B, MM, DECOMP_EIG );
    }
    else
    {
        double sa[4][4]={{0.}}, sb[4]={0.}, m[4];
        Mat A( 4, 4, CV_64F, sa ), B( 4, 1, CV_64F, sb );
        Mat MM( 4, 1, CV_64F, m );

        for( int i = 0; i < count; i++ )
        {
            sa[0][0] += a[i].x*a[i].x + a[i].y*a[i].y;
            sa[0][2] += a[i].x;
            sa[0][3] += a[i].y;

            sa[2][1] += -a[i].y;
            sa[2][2] += 1;

            sa[3][0] += a[i].y;
            sa[3][1] += a[i].x;
            sa[3][3] += 1;

            sb[0] += a[i].x*b[i].x + a[i].y*b[i].y;
            sb[1] += a[i].x*b[i].y - a[i].y*b[i].x;
            sb[2] += b[i].x;
            sb[3] += b[i].y;
        }

        sa[1][1] = sa[0][0];
        sa[2][1] = sa[1][2] = -sa[0][3];
        sa[3][1] = sa[1][3] = sa[2][0] = sa[0][2];
        sa[2][2] = sa[3][3] = count;
        sa[3][0] = sa[0][3];

        solve( A, B, MM, DECOMP_EIG );

        double* om = M.ptr<double>();
        om[0] = om[4] = m[0];
        om[1] = -m[1];
        om[3] = m[1];
        om[2] = m[2];
        om[5] = m[3];
    }
}

}

cv::Mat cv::estimateRigidTransform( InputArray src1, InputArray src2, bool fullAffine )
{
    Mat M(2, 3, CV_64F), A = src1.getMat(), B = src2.getMat();

    const int COUNT = 15;
    const int WIDTH = 160, HEIGHT = 120;
    const int RANSAC_MAX_ITERS = 500;
    const int RANSAC_SIZE0 = 3;
    const double RANSAC_GOOD_RATIO = 0.5;

    std::vector<Point2f> pA, pB;
    std::vector<int> good_idx;
    std::vector<uchar> status;

    double scale = 1.;
    int i, j, k, k1;

    RNG rng((uint64)-1);
    int good_count = 0;

    if( A.size() != B.size() )
        CV_Error( Error::StsUnmatchedSizes, "Both input images must have the same size" );

    if( A.type() != B.type() )
        CV_Error( Error::StsUnmatchedFormats, "Both input images must have the same data type" );

    int count = A.checkVector(2);

    if( count > 0 )
    {
        A.reshape(2, count).convertTo(pA, CV_32F);
        B.reshape(2, count).convertTo(pB, CV_32F);
    }
    else if( A.depth() == CV_8U )
    {
        int cn = A.channels();
        CV_Assert( cn == 1 || cn == 3 || cn == 4 );
        Size sz0 = A.size();
        Size sz1(WIDTH, HEIGHT);

        scale = std::max(1., std::max( (double)sz1.width/sz0.width, (double)sz1.height/sz0.height ));

        sz1.width = cvRound( sz0.width * scale );
        sz1.height = cvRound( sz0.height * scale );

        bool equalSizes = sz1.width == sz0.width && sz1.height == sz0.height;

        if( !equalSizes || cn != 1 )
        {
            Mat sA, sB;

            if( cn != 1 )
            {
                Mat gray;
                cvtColor(A, gray, COLOR_BGR2GRAY);
                resize(gray, sA, sz1, 0., 0., INTER_AREA);
                cvtColor(B, gray, COLOR_BGR2GRAY);
                resize(gray, sB, sz1, 0., 0., INTER_AREA);
            }
            else
            {
                resize(A, sA, sz1, 0., 0., INTER_AREA);
                resize(B, sB, sz1, 0., 0., INTER_AREA);
            }

            A = sA;
            B = sB;
        }

        int count_y = COUNT;
        int count_x = cvRound((double)COUNT*sz1.width/sz1.height);
        count = count_x * count_y;

        pA.resize(count);
        pB.resize(count);
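        // fill pA with a regular count_x x count_y grid of cell centers;
        // pB receives the tracked positions of those points in the second image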
        status.resize(count);

        for( i = 0, k = 0; i < count_y; i++ )
            for( j = 0; j < count_x; j++, k++ )
            {
                pA[k].x = (j+0.5f)*sz1.width/count_x;
                pA[k].y = (i+0.5f)*sz1.height/count_y;
            }

        // find the corresponding points in B
        calcOpticalFlowPyrLK(A, B, pA, pB, status, noArray(), Size(21, 21), 3,
                             TermCriteria(TermCriteria::MAX_ITER,40,0.1));

        // repack the remaining points
        for( i = 0, k = 0; i < count; i++ )
            if( status[i] )
            {
                if( i > k )
                {
                    pA[k] = pA[i];
                    pB[k] = pB[i];
                }
                k++;
            }

        count = k;
        pA.resize(count);
        pB.resize(count);
    }
    else
        CV_Error( Error::StsUnsupportedFormat, "Both input images must have either 8uC1 or 8uC3 type" );

    good_idx.resize(count);

    if( count < RANSAC_SIZE0 )
        return Mat();

    Rect brect = boundingRect(pB);

    // RANSAC stuff:
    // 1. find the consensus
    for( k = 0; k < RANSAC_MAX_ITERS; k++ )
    {
        int idx[RANSAC_SIZE0];
        Point2f a[RANSAC_SIZE0];
        Point2f b[RANSAC_SIZE0];

        // choose 3 random non-collinear points from A & B
        for( i = 0; i < RANSAC_SIZE0; i++ )
        {
            for( k1 = 0; k1 < RANSAC_MAX_ITERS; k1++ )
            {
                idx[i] = rng.uniform(0, count);

                for( j = 0; j < i; j++ )
                {
                    if( idx[j] == idx[i] )
                        break;
                    // check that the points are not very close to each other
                    if( fabs(pA[idx[i]].x - pA[idx[j]].x) +
                        fabs(pA[idx[i]].y - pA[idx[j]].y) < FLT_EPSILON )
                        break;
                    if( fabs(pB[idx[i]].x - pB[idx[j]].x) +
                        fabs(pB[idx[i]].y - pB[idx[j]].y) < FLT_EPSILON )
                        break;
                }

                if( j < i )
                    continue;

                if( i+1 == RANSAC_SIZE0 )
                {
                    // additional check for non-collinear vectors
                    a[0] = pA[idx[0]];
                    a[1] = pA[idx[1]];
                    a[2] = pA[idx[2]];

                    b[0] = pB[idx[0]];
                    b[1] = pB[idx[1]];
                    b[2] = pB[idx[2]];

                    double dax1 = a[1].x - a[0].x, day1 = a[1].y - a[0].y;
                    double dax2 = a[2].x - a[0].x, day2 = a[2].y - a[0].y;
                    double dbx1 = b[1].x - b[0].x, dby1 = b[1].y - b[0].y;
                    double dbx2 = b[2].x - b[0].x, dby2 = b[2].y - b[0].y;
                    const double eps = 0.01;

                    if( fabs(dax1*day2 - day1*dax2) < eps*std::sqrt(dax1*dax1+day1*day1)*std::sqrt(dax2*dax2+day2*day2) ||
                        fabs(dbx1*dby2 - dby1*dbx2) < eps*std::sqrt(dbx1*dbx1+dby1*dby1)*std::sqrt(dbx2*dbx2+dby2*dby2) )
                        continue;
                }
                break;
            }

            if( k1 >= RANSAC_MAX_ITERS )
                break;
        }

        if( i < RANSAC_SIZE0 )
            continue;

        // estimate the transformation using 3 points
        getRTMatrix( a, b, 3, M, fullAffine );

        const double* m = M.ptr<double>();
        for( i = 0, good_count = 0; i < count; i++ )
        {
            if( std::abs( m[0]*pA[i].x + m[1]*pA[i].y + m[2] - pB[i].x ) +
                std::abs( m[3]*pA[i].x + m[4]*pA[i].y + m[5] - pB[i].y ) < std::max(brect.width,brect.height)*0.05 )
                good_idx[good_count++] = i;
        }

        if( good_count >= count*RANSAC_GOOD_RATIO )
            break;
    }

    if( k >= RANSAC_MAX_ITERS )
        return Mat();

    if( good_count < count )
    {
        for( i = 0; i < good_count; i++ )
        {
            j = good_idx[i];
            pA[i] = pA[j];
            pB[i] = pB[j];
        }
    }

    getRTMatrix( &pA[0], &pB[0], good_count, M, fullAffine );
    M.at<double>(0, 2) /= scale;
    M.at<double>(1, 2) /= scale;

    return M;
}

/* End of file. */