1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #if !defined CUDA_DISABLER 44 45 #include "opencv2/core/cuda/common.hpp" 46 #include "opencv2/core/cuda/limits.hpp" 47 #include "opencv2/core/cuda/functional.hpp" 48 #include "opencv2/core/cuda/reduce.hpp" 49 50 using namespace cv::cuda; 51 using namespace cv::cuda::device; 52 53 namespace optflowbm_fast 54 { 55 enum 56 { 57 CTA_SIZE = 128, 58 59 TILE_COLS = 128, 60 TILE_ROWS = 32, 61 62 STRIDE = CTA_SIZE 63 }; 64 65 template <typename T> __device__ __forceinline__ int calcDist(T a, T b) 66 { 67 return ::abs(a - b); 68 } 69 70 template <class T> struct FastOptFlowBM 71 { 72 73 int search_radius; 74 int block_radius; 75 76 int search_window; 77 int block_window; 78 79 PtrStepSz<T> I0; 80 PtrStep<T> I1; 81 82 mutable PtrStepi buffer; 83 84 FastOptFlowBM(int search_window_, int block_window_, 85 PtrStepSz<T> I0_, PtrStepSz<T> I1_, 86 PtrStepi buffer_) : 87 search_radius(search_window_ / 2), block_radius(block_window_ / 2), 88 search_window(search_window_), block_window(block_window_), 89 I0(I0_), I1(I1_), 90 buffer(buffer_) 91 { 92 } 93 94 __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const 95 { 96 for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 97 { 98 dist_sums[index] = 0; 99 100 for (int tx = 0; tx < block_window; ++tx) 101 col_sums(tx, index) = 0; 102 103 int y = index / search_window; 104 int x = index - y * search_window; 105 106 int ay = i; 107 int ax = j; 108 109 int by = i + y - search_radius; 110 int bx = j + x - search_radius; 111 112 for (int tx = -block_radius; tx <= block_radius; ++tx) 113 { 114 int col_sum = 0; 115 for (int ty = -block_radius; ty <= block_radius; ++ty) 116 { 117 int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx)); 118 119 dist_sums[index] += dist; 120 col_sum += dist; 121 } 122 123 col_sums(tx + block_radius, index) = col_sum; 124 } 125 126 up_col_sums(j, index) = col_sums(block_window - 1, index); 127 } 128 } 129 130 __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const 131 { 132 for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 133 { 134 int y = index / search_window; 135 int x = index - y * search_window; 136 137 int ay = i; 138 int ax = j + block_radius; 139 140 int by = i + y - search_radius; 141 int bx = j + x - search_radius + block_radius; 142 143 int col_sum = 0; 144 145 for (int ty = -block_radius; ty <= block_radius; ++ty) 146 col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx)); 147 148 dist_sums[index] += col_sum - col_sums(first, index); 149 150 col_sums(first, index) = col_sum; 151 up_col_sums(j, index) = col_sum; 152 } 153 } 154 155 __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const 156 { 157 int ay = i; 158 int ax = j + block_radius; 159 160 T a_up = I0(ay - block_radius - 1, ax); 161 T a_down = I0(ay + block_radius, ax); 162 163 for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 164 { 165 int y = index / search_window; 166 int x = index - y * search_window; 167 168 int by = i + y - search_radius; 169 int bx = j + x - search_radius + block_radius; 170 171 T b_up = I1(by - block_radius - 1, bx); 172 T b_down = I1(by + block_radius, bx); 173 174 int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up); 175 176 dist_sums[index] += col_sum - col_sums(first, index); 177 col_sums(first, index) = col_sum; 178 up_col_sums(j, index) = col_sum; 179 } 180 } 181 182 __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const 183 { 184 int bestDist = numeric_limits<int>::max(); 185 int bestInd = -1; 186 187 for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 188 { 189 int curDist = dist_sums[index]; 190 if (curDist < bestDist) 191 { 192 bestDist = curDist; 193 bestInd = index; 194 } 195 } 196 197 __shared__ int cta_dist_buffer[CTA_SIZE]; 198 __shared__ int cta_ind_buffer[CTA_SIZE]; 199 200 reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>()); 201 202 if (threadIdx.x == 0) 203 { 204 int y = bestInd / search_window; 205 int x = bestInd - y * search_window; 206 207 velx = x - search_radius; 208 vely = y - search_radius; 209 } 210 } 211 212 __device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const 213 { 214 int tbx = blockIdx.x * TILE_COLS; 215 int tby = blockIdx.y * TILE_ROWS; 216 217 int tex = ::min(tbx + TILE_COLS, I0.cols); 218 int tey = ::min(tby + TILE_ROWS, I0.rows); 219 220 PtrStepi col_sums; 221 col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window; 222 col_sums.step = buffer.step; 223 224 PtrStepi up_col_sums; 225 up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window; 226 up_col_sums.step = buffer.step; 227 228 extern __shared__ int dist_sums[]; //search_window * search_window 229 230 int first = 0; 231 232 for (int i = tby; i < tey; ++i) 233 { 234 for (int j = tbx; j < tex; ++j) 235 { 236 __syncthreads(); 237 238 if (j == tbx) 239 { 240 initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums); 241 first = 0; 242 } 243 else 244 { 245 if (i == tby) 246 shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums); 247 else 248 shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums); 249 250 first = (first + 1) % block_window; 251 } 252 253 __syncthreads(); 254 255 convolve_window(i, j, dist_sums, velx(i, j), vely(i, j)); 256 } 257 } 258 } 259 260 }; 261 262 template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely) 263 { 264 fbm(velx, vely); 265 } 266 267 void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows) 268 { 269 dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS)); 270 271 buffer_cols = search_window * search_window * grid.y; 272 buffer_rows = src_cols + block_window * grid.x; 273 } 274 275 template <typename T> 276 void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream) 277 { 278 FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer); 279 280 dim3 block(CTA_SIZE, 1); 281 dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS)); 282 283 size_t smem = search_window * search_window * sizeof(int); 284 285 optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely); 286 cudaSafeCall ( cudaGetLastError () ); 287 288 if (stream == 0) 289 cudaSafeCall( cudaDeviceSynchronize() ); 290 } 291 292 template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream); 293 } 294 295 #endif // !defined CUDA_DISABLER 296