/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/transform.h>

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_CUDAARITHM

namespace cv { namespace cuda { namespace device
{
    namespace ght
    {
        __device__ int g_counter;

        // Compact the edge map into a flat list of edge points: each selected pixel is
        // stored as a packed (y << 16 | x) coordinate together with its gradient
        // orientation. Per-row shared-memory queues are merged into the global list
        // with a single atomic reservation per block.
        template <typename T, int PIXELS_PER_THREAD>
        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy,
                                           unsigned int* coordList, float* thetaList)
        {
            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_sizes[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_sizes[threadIdx.y] = 0;
            __syncthreads();

            if (y < edges.rows)
            {
                // fill the queue
                const uchar* edgesRow = edges.ptr(y);
                const T* dxRow = dx.ptr(y);
                const T* dyRow = dy.ptr(y);

                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
                {
                    const T dxVal = dxRow[xx];
                    const T dyVal = dyRow[xx];

                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                    {
                        const unsigned int coord = (y << 16) | xx;

                        float theta = ::atan2f(dyVal, dxVal);
                        if (theta < 0)
                            theta += 2.0f * CV_PI_F;

                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                        s_coordLists[threadIdx.y][qidx] = coord;
                        s_thetaLists[threadIdx.y][qidx] = theta;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_sizes[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_sizes[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            {
                coordList[gidx] = s_coordLists[threadIdx.y][i];
                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
            }
        }

        template <typename T>
        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
        {
            const int PIXELS_PER_THREAD = 8;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

        // Build the R-table used by the Ballard transform: each template edge point is
        // binned by its gradient orientation, and its displacement from the template
        // center is appended to the corresponding row.
        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                    const short2 templCenter, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const int ind = ::atomicAdd(r_sizes + n, 1);
            if (ind < maxSize)
                r_table(n, ind) = saturate_cast<short2>(p - templCenter);
        }

        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                             PtrStepSz<short2> r_table, int* r_sizes,
                             short2 templCenter, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float thetaScale = levels / (2.0f * CV_PI_F);

            buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // Ballard_Pos

        // Voting kernel: every image edge point looks up the R-table row that matches
        // its gradient orientation and casts one vote per stored displacement into the
        // 2D position accumulator (which is padded by one cell on each side).
        __global__ void Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                             const PtrStep<short2> r_table, const int* r_sizes,
                                             PtrStepSzi hist,
                                             const float idp, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                short2 c = saturate_cast<short2>(p - r_row[j]);

                c.x = __float2int_rn(c.x * idp);
                c.y = __float2int_rn(c.y * idp);

                if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                    ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
            }
        }

        void Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                      PtrStepSz<short2> r_table, const int* r_sizes,
                                      PtrStepSzi hist,
                                      float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Extract local maxima of the accumulator that exceed the vote threshold; each
        // maximum is reported as a candidate position with fixed scale 1 and angle 0.
        __global__ void Ballard_Pos_findPosInHist(const PtrStepSzi hist,
                                                  float4* out, int3* votes,
                                                  const int maxSize, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                    votes[ind] = make_int3(curVotes, 0, 0);
                }
            }
        }

        int Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

            Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // Guil_Full

        // Feature tables for the Guil variant: rows are indexed by the quantized pair
        // angle alpha12, columns by the pair index within that bin. The tables are
        // addressed through the __constant__ descriptors below.
        struct FeatureTable
        {
            uchar* p1_pos_data;
            size_t p1_pos_step;

            uchar* p1_theta_data;
            size_t p1_theta_step;

            uchar* p2_pos_data;
            size_t p2_pos_step;

            uchar* d12_data;
            size_t d12_step;

            uchar* r1_data;
            size_t r1_step;

            uchar* r2_data;
            size_t r2_step;
        };

        __constant__ FeatureTable c_templFeatures;
        __constant__ FeatureTable c_imageFeatures;

        void Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
        }
        void Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
        }

        struct TemplFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
            }
        };
        struct ImageFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
            }
        };

        __device__ float clampAngle(float a)
        {
            float res = a;

            while (res > 2.0f * CV_PI_F)
                res -= 2.0f * CV_PI_F;
            while (res < 0.0f)
                res += 2.0f * CV_PI_F;

            return res;
        }

        __device__ bool angleEq(float a, float b, float eps)
        {
            return (::fabs(clampAngle(a - b)) <= eps);
        }

        // For each pair of edge points whose gradient orientations differ by xi, record
        // the pair features (orientation, distance, and either the point positions or the
        // displacements r1/r2 from the template center), binned by the pair angle alpha12.
        template <class FT, bool isTempl>
        __global__ void Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                   int* sizes, const int maxSize,
                                                   const float xi, const float angleEpsilon, const float alphaScale,
                                                   const float2 center, const float maxDist)
        {
            const float p1_theta = thetaList[blockIdx.x];
            const unsigned int coord1 = coordList[blockIdx.x];
            float2 p1_pos;
            p1_pos.x = (coord1 & 0xFFFF);
            p1_pos.y = (coord1 >> 16) & 0xFFFF;

            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
            {
                const float p2_theta = thetaList[i];
                const unsigned int coord2 = coordList[i];
                float2 p2_pos;
                p2_pos.x = (coord2 & 0xFFFF);
                p2_pos.y = (coord2 >> 16) & 0xFFFF;

                if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
                {
                    const float2 d = p1_pos - p2_pos;

                    float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
                    float d12 = ::sqrtf(d.x * d.x + d.y * d.y);

                    if (d12 > maxDist)
                        continue;

                    float2 r1 = p1_pos - center;
                    float2 r2 = p2_pos - center;

                    const int n = __float2int_rn(alpha12 * alphaScale);

                    const int ind = ::atomicAdd(sizes + n, 1);

                    if (ind < maxSize)
                    {
                        if (!isTempl)
                        {
                            FT::p1_pos(n)[ind] = p1_pos;
                            FT::p2_pos(n)[ind] = p2_pos;
                        }

                        FT::p1_theta(n)[ind] = p1_theta;

                        FT::d12(n)[ind] = d12;

                        if (isTempl)
                        {
                            FT::r1(n)[ind] = r1;
                            FT::r2(n)[ind] = r2;
                        }
                    }
                }
            }
        }

        template <class FT, bool isTempl>
        void Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               int* sizes, int maxSize,
                                               float xi, float angleEpsilon, int levels,
                                               float2 center, float maxDist)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float alphaScale = levels / (2.0f * CV_PI_F);

            Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                     sizes, maxSize,
                                                                     xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                     center, maxDist);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            // clamp the per-bin sizes to the table capacity
            thrust::device_ptr<int> sizesPtr(sizes);
            thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
        }

        void Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                 int* sizes, int maxSize,
                                                 float xi, float angleEpsilon, int levels,
                                                 float2 center, float maxDist)
        {
            Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                       sizes, maxSize,
                                                                       xi, angleEpsilon, levels,
                                                                       center, maxDist);
        }
        void Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                 int* sizes, int maxSize,
                                                 float xi, float angleEpsilon, int levels,
                                                 float2 center, float maxDist)
        {
            Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                        sizes, maxSize,
                                                                        xi, angleEpsilon, levels,
                                                                        center, maxDist);
        }

        // Orientation histogram: for every (template, image) pair of features in the same
        // alpha12 bin, vote for the rotation that maps the template orientation onto the
        // image orientation.
        __global__ void Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
                                            const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
        {
            extern __shared__ int s_OHist[];
            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                s_OHist[i] = 0;
            __syncthreads();

            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float angle = clampAngle(im_p1_theta - t_p1_theta);

                    if (angle >= minAngle && angle <= maxAngle)
                    {
                        const int n = __float2int_rn((angle - minAngle) * iAngleStep);
                        Emulation::smem::atomicAdd(&s_OHist[n], 1);
                    }
                }
            }
            __syncthreads();

            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                ::atomicAdd(OHist + i, s_OHist[i]);
        }

        void Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
                                     float minAngle, float maxAngle, float angleStep, int angleRange,
                                     int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            minAngle *= (CV_PI_F / 180.0f);
            maxAngle *= (CV_PI_F / 180.0f);
            angleStep *= (CV_PI_F / 180.0f);

            const size_t smemSize = (angleRange + 1) * sizeof(int);

            Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
                                                           minAngle, maxAngle, 1.0f / angleStep, angleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Scale histogram: for feature pairs whose orientations match under the chosen
        // rotation, vote for the ratio of image to template pair distances.
        __global__ void Guil_Full_calcSHist(const int* templSizes, const int* imageSizes,
                                            int* SHist,
                                            const float angle, const float angleEpsilon,
                                            const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
        {
            extern __shared__ int s_SHist[];
            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                s_SHist[i] = 0;
            __syncthreads();

            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
                const float t_d12 = TemplFeatureTable::d12(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                    const float im_d12 = ImageFeatureTable::d12(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        const float scale = im_d12 / t_d12;

                        if (scale >= minScale && scale <= maxScale)
                        {
                            const int s = __float2int_rn((scale - minScale) * iScaleStep);
                            Emulation::smem::atomicAdd(&s_SHist[s], 1);
                        }
                    }
                }
            }
            __syncthreads();

            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                ::atomicAdd(SHist + i, s_SHist[i]);
        }

        void Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
                                     float angle, float angleEpsilon,
                                     float minScale, float maxScale, float iScaleStep, int scaleRange,
                                     int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            const size_t smemSize = (scaleRange + 1) * sizeof(int);

            Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
                                                           angle, angleEpsilon,
                                                           minScale, maxScale, iScaleStep, scaleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Position histogram: rotate and scale the template displacement vectors r1/r2,
        // and vote for the reference point only when both points of an image pair agree
        // on (almost) the same accumulator cell.
        __global__ void Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                            const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
                                            const float idp)
        {
            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;

                float2 r1 = TemplFeatureTable::r1(level)[tIdx];
                float2 r2 = TemplFeatureTable::r2(level)[tIdx];

                r1 = r1 * scale;
                r2 = r2 * scale;

                r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
                r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
                    const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        float2 c1, c2;

                        c1 = im_p1_pos - r1;
                        c1 = c1 * idp;

                        c2 = im_p2_pos - r2;
                        c2 = c2 * idp;

                        if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
                            continue;

                        if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
                            ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
                    }
                }
            }
        }

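        // Host launcher for the position-voting kernel above. As elsewhere in this file,
        // the angle arguments arrive in degrees and are converted to radians on the host;
        // the rotation by the candidate angle is folded into the precomputed sinVal/cosVal
        // that the kernel applies to the template displacement vectors.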
        void Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                     float angle, float angleEpsilon, float scale,
                                     float dp,
                                     int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            const float sinVal = ::sinf(angle);
            const float cosVal = ::cosf(angle);

            cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_calcPHist, cudaFuncCachePreferL1) );

            Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
                                                 angle, sinVal, cosVal, angleEpsilon, scale,
                                                 1.0f / dp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Same local-maximum search as in the Ballard case, but the output also carries
        // the current scale/angle hypothesis and their vote counts; the global counter is
        // pre-seeded with the number of candidates found so far (curSize in the launcher).
        __global__ void Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
                                                const float angle, const int angleVotes, const float scale, const int scaleVotes,
                                                const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, scale, angle);
                    votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
                }
            }
        }

        int Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
                                        float angle, int angleVotes, float scale, int scaleVotes,
                                        float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_findPosInHist, cudaFuncCachePreferL1) );

            Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
                                                     angle, angleVotes, scale, scaleVotes,
                                                     dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }
    }
}}}

#endif // HAVE_OPENCV_CUDAARITHM

#endif /* CUDA_DISABLER */