/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #include "precomp.hpp" 44 45 using namespace cv; 46 using namespace cv::cuda; 47 48 ///////////////////////////////////////////////////////////// 49 /// MemoryStack 50 51 #ifdef HAVE_CUDA 52 53 namespace 54 { 55 class MemoryPool; 56 57 class MemoryStack 58 { 59 public: 60 uchar* requestMemory(size_t size); 61 void returnMemory(uchar* ptr); 62 63 uchar* datastart; 64 uchar* dataend; 65 uchar* tip; 66 67 bool isFree; 68 MemoryPool* pool; 69 70 #if !defined(NDEBUG) 71 std::vector<size_t> allocations; 72 #endif 73 }; 74 75 uchar* MemoryStack::requestMemory(size_t size) 76 { 77 const size_t freeMem = dataend - tip; 78 79 if (size > freeMem) 80 return 0; 81 82 uchar* ptr = tip; 83 84 tip += size; 85 86 #if !defined(NDEBUG) 87 allocations.push_back(size); 88 #endif 89 90 return ptr; 91 } 92 93 void MemoryStack::returnMemory(uchar* ptr) 94 { 95 CV_DbgAssert( ptr >= datastart && ptr < dataend ); 96 97 #if !defined(NDEBUG) 98 const size_t allocSize = tip - ptr; 99 CV_Assert( allocSize == allocations.back() ); 100 allocations.pop_back(); 101 #endif 102 103 tip = ptr; 104 } 105 } 106 107 #endif 108 109 ///////////////////////////////////////////////////////////// 110 /// MemoryPool 111 112 #ifdef HAVE_CUDA 113 114 namespace 115 { 116 class MemoryPool 117 { 118 public: 119 MemoryPool(); 120 121 void initialize(size_t stackSize, int stackCount); 122 void release(); 123 124 MemoryStack* getFreeMemStack(); 125 
void returnMemStack(MemoryStack* memStack); 126 127 private: 128 void initilizeImpl(); 129 130 Mutex mtx_; 131 132 bool initialized_; 133 size_t stackSize_; 134 int stackCount_; 135 136 uchar* mem_; 137 138 std::vector<MemoryStack> stacks_; 139 }; 140 141 MemoryPool::MemoryPool() : initialized_(false), mem_(0) 142 { 143 // default : 10 Mb, 5 stacks 144 stackSize_ = 10 * 1024 * 1024; 145 stackCount_ = 5; 146 } 147 148 void MemoryPool::initialize(size_t stackSize, int stackCount) 149 { 150 AutoLock lock(mtx_); 151 152 release(); 153 154 stackSize_ = stackSize; 155 stackCount_ = stackCount; 156 157 initilizeImpl(); 158 } 159 160 void MemoryPool::initilizeImpl() 161 { 162 const size_t totalSize = stackSize_ * stackCount_; 163 164 if (totalSize > 0) 165 { 166 cudaError_t err = cudaMalloc(&mem_, totalSize); 167 if (err != cudaSuccess) 168 return; 169 170 stacks_.resize(stackCount_); 171 172 uchar* ptr = mem_; 173 174 for (int i = 0; i < stackCount_; ++i) 175 { 176 stacks_[i].datastart = ptr; 177 stacks_[i].dataend = ptr + stackSize_; 178 stacks_[i].tip = ptr; 179 stacks_[i].isFree = true; 180 stacks_[i].pool = this; 181 182 ptr += stackSize_; 183 } 184 185 initialized_ = true; 186 } 187 } 188 189 void MemoryPool::release() 190 { 191 if (mem_) 192 { 193 #if !defined(NDEBUG) 194 for (int i = 0; i < stackCount_; ++i) 195 { 196 CV_DbgAssert( stacks_[i].isFree ); 197 CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart ); 198 } 199 #endif 200 201 cudaFree(mem_); 202 203 mem_ = 0; 204 initialized_ = false; 205 } 206 } 207 208 MemoryStack* MemoryPool::getFreeMemStack() 209 { 210 AutoLock lock(mtx_); 211 212 if (!initialized_) 213 initilizeImpl(); 214 215 if (!mem_) 216 return 0; 217 218 for (int i = 0; i < stackCount_; ++i) 219 { 220 if (stacks_[i].isFree) 221 { 222 stacks_[i].isFree = false; 223 return &stacks_[i]; 224 } 225 } 226 227 return 0; 228 } 229 230 void MemoryPool::returnMemStack(MemoryStack* memStack) 231 { 232 AutoLock lock(mtx_); 233 234 CV_DbgAssert( 
!memStack->isFree ); 235 236 #if !defined(NDEBUG) 237 bool found = false; 238 for (int i = 0; i < stackCount_; ++i) 239 { 240 if (memStack == &stacks_[i]) 241 { 242 found = true; 243 break; 244 } 245 } 246 CV_DbgAssert( found ); 247 #endif 248 249 CV_DbgAssert( memStack->tip == memStack->datastart ); 250 251 memStack->isFree = true; 252 } 253 } 254 255 #endif 256 257 //////////////////////////////////////////////////////////////// 258 /// Stream::Impl 259 260 #ifndef HAVE_CUDA 261 262 class cv::cuda::Stream::Impl 263 { 264 public: 265 Impl(void* ptr = 0) 266 { 267 (void) ptr; 268 throw_no_cuda(); 269 } 270 }; 271 272 #else 273 274 namespace 275 { 276 class StackAllocator; 277 } 278 279 class cv::cuda::Stream::Impl 280 { 281 public: 282 cudaStream_t stream; 283 Ptr<StackAllocator> stackAllocator_; 284 285 Impl(); 286 Impl(cudaStream_t stream); 287 288 ~Impl(); 289 }; 290 291 cv::cuda::Stream::Impl::Impl() : stream(0) 292 { 293 cudaSafeCall( cudaStreamCreate(&stream) ); 294 295 stackAllocator_ = makePtr<StackAllocator>(stream); 296 } 297 298 cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_) 299 { 300 stackAllocator_ = makePtr<StackAllocator>(stream); 301 } 302 303 cv::cuda::Stream::Impl::~Impl() 304 { 305 stackAllocator_.release(); 306 307 if (stream) 308 cudaStreamDestroy(stream); 309 } 310 311 #endif 312 313 ///////////////////////////////////////////////////////////// 314 /// DefaultDeviceInitializer 315 316 #ifdef HAVE_CUDA 317 318 namespace cv { namespace cuda 319 { 320 class DefaultDeviceInitializer 321 { 322 public: 323 DefaultDeviceInitializer(); 324 ~DefaultDeviceInitializer(); 325 326 Stream& getNullStream(int deviceId); 327 MemoryPool* getMemoryPool(int deviceId); 328 329 private: 330 void initStreams(); 331 void initPools(); 332 333 std::vector<Ptr<Stream> > streams_; 334 Mutex streams_mtx_; 335 336 std::vector<MemoryPool> pools_; 337 Mutex pools_mtx_; 338 }; 339 340 DefaultDeviceInitializer::DefaultDeviceInitializer() 341 { 342 } 343 
344 DefaultDeviceInitializer::~DefaultDeviceInitializer() 345 { 346 streams_.clear(); 347 348 for (size_t i = 0; i < pools_.size(); ++i) 349 { 350 cudaSetDevice(static_cast<int>(i)); 351 pools_[i].release(); 352 } 353 354 pools_.clear(); 355 } 356 357 Stream& DefaultDeviceInitializer::getNullStream(int deviceId) 358 { 359 AutoLock lock(streams_mtx_); 360 361 if (streams_.empty()) 362 { 363 int deviceCount = getCudaEnabledDeviceCount(); 364 365 if (deviceCount > 0) 366 streams_.resize(deviceCount); 367 } 368 369 CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) ); 370 371 if (streams_[deviceId].empty()) 372 { 373 cudaStream_t stream = NULL; 374 Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream); 375 streams_[deviceId] = Ptr<Stream>(new Stream(impl)); 376 } 377 378 return *streams_[deviceId]; 379 } 380 381 MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId) 382 { 383 AutoLock lock(pools_mtx_); 384 385 if (pools_.empty()) 386 { 387 int deviceCount = getCudaEnabledDeviceCount(); 388 389 if (deviceCount > 0) 390 pools_.resize(deviceCount); 391 } 392 393 CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) ); 394 395 return &pools_[deviceId]; 396 } 397 398 DefaultDeviceInitializer initializer; 399 }} 400 401 #endif 402 403 ///////////////////////////////////////////////////////////// 404 /// Stream 405 406 cv::cuda::Stream::Stream() 407 { 408 #ifndef HAVE_CUDA 409 throw_no_cuda(); 410 #else 411 impl_ = makePtr<Impl>(); 412 #endif 413 } 414 415 bool cv::cuda::Stream::queryIfComplete() const 416 { 417 #ifndef HAVE_CUDA 418 throw_no_cuda(); 419 return false; 420 #else 421 cudaError_t err = cudaStreamQuery(impl_->stream); 422 423 if (err == cudaErrorNotReady || err == cudaSuccess) 424 return err == cudaSuccess; 425 426 cudaSafeCall(err); 427 return false; 428 #endif 429 } 430 431 void cv::cuda::Stream::waitForCompletion() 432 { 433 #ifndef HAVE_CUDA 434 throw_no_cuda(); 435 #else 436 cudaSafeCall( 
cudaStreamSynchronize(impl_->stream) ); 437 #endif 438 } 439 440 void cv::cuda::Stream::waitEvent(const Event& event) 441 { 442 #ifndef HAVE_CUDA 443 (void) event; 444 throw_no_cuda(); 445 #else 446 cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) ); 447 #endif 448 } 449 450 #if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000) 451 452 namespace 453 { 454 struct CallbackData 455 { 456 Stream::StreamCallback callback; 457 void* userData; 458 459 CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {} 460 }; 461 462 void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData) 463 { 464 CallbackData* data = reinterpret_cast<CallbackData*>(userData); 465 data->callback(static_cast<int>(status), data->userData); 466 delete data; 467 } 468 } 469 470 #endif 471 472 void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData) 473 { 474 #ifndef HAVE_CUDA 475 (void) callback; 476 (void) userData; 477 throw_no_cuda(); 478 #else 479 #if CUDART_VERSION < 5000 480 (void) callback; 481 (void) userData; 482 CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0"); 483 #else 484 CallbackData* data = new CallbackData(callback, userData); 485 486 cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) ); 487 #endif 488 #endif 489 } 490 491 Stream& cv::cuda::Stream::Null() 492 { 493 #ifndef HAVE_CUDA 494 throw_no_cuda(); 495 static Stream stream; 496 return stream; 497 #else 498 const int deviceId = getDevice(); 499 return initializer.getNullStream(deviceId); 500 #endif 501 } 502 503 cv::cuda::Stream::operator bool_type() const 504 { 505 #ifndef HAVE_CUDA 506 return 0; 507 #else 508 return (impl_->stream != 0) ? 
&Stream::this_type_does_not_support_comparisons : 0; 509 #endif 510 } 511 512 #ifdef HAVE_CUDA 513 514 cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream) 515 { 516 return stream.impl_->stream; 517 } 518 519 #endif 520 521 ///////////////////////////////////////////////////////////// 522 /// StackAllocator 523 524 #ifdef HAVE_CUDA 525 526 namespace 527 { 528 bool enableMemoryPool = true; 529 530 class StackAllocator : public GpuMat::Allocator 531 { 532 public: 533 explicit StackAllocator(cudaStream_t stream); 534 ~StackAllocator(); 535 536 bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize); 537 void free(GpuMat* mat); 538 539 private: 540 StackAllocator(const StackAllocator&); 541 StackAllocator& operator =(const StackAllocator&); 542 543 cudaStream_t stream_; 544 MemoryStack* memStack_; 545 size_t alignment_; 546 }; 547 548 StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0) 549 { 550 if (enableMemoryPool) 551 { 552 const int deviceId = getDevice(); 553 memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack(); 554 DeviceInfo devInfo(deviceId); 555 alignment_ = devInfo.textureAlignment(); 556 } 557 } 558 559 StackAllocator::~StackAllocator() 560 { 561 cudaStreamSynchronize(stream_); 562 563 if (memStack_ != 0) 564 memStack_->pool->returnMemStack(memStack_); 565 } 566 567 size_t alignUp(size_t what, size_t alignment) 568 { 569 size_t alignMask = alignment-1; 570 size_t inverseAlignMask = ~alignMask; 571 size_t res = (what + alignMask) & inverseAlignMask; 572 return res; 573 } 574 575 bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize) 576 { 577 if (memStack_ == 0) 578 return false; 579 580 size_t pitch, memSize; 581 582 if (rows > 1 && cols > 1) 583 { 584 pitch = alignUp(cols * elemSize, alignment_); 585 memSize = pitch * rows; 586 } 587 else 588 { 589 // Single row or single column must be continuous 590 pitch = elemSize * cols; 591 memSize = alignUp(elemSize * 
cols * rows, 64); 592 } 593 594 uchar* ptr = memStack_->requestMemory(memSize); 595 596 if (ptr == 0) 597 return false; 598 599 mat->data = ptr; 600 mat->step = pitch; 601 mat->refcount = (int*) fastMalloc(sizeof(int)); 602 603 return true; 604 } 605 606 void StackAllocator::free(GpuMat* mat) 607 { 608 if (memStack_ == 0) 609 return; 610 611 memStack_->returnMemory(mat->datastart); 612 fastFree(mat->refcount); 613 } 614 } 615 616 #endif 617 618 ///////////////////////////////////////////////////////////// 619 /// BufferPool 620 621 void cv::cuda::setBufferPoolUsage(bool on) 622 { 623 #ifndef HAVE_CUDA 624 (void)on; 625 throw_no_cuda(); 626 #else 627 enableMemoryPool = on; 628 #endif 629 } 630 631 void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount) 632 { 633 #ifndef HAVE_CUDA 634 (void)deviceId; 635 (void)stackSize; 636 (void)stackCount; 637 throw_no_cuda(); 638 #else 639 const int currentDevice = getDevice(); 640 641 if (deviceId >= 0) 642 { 643 setDevice(deviceId); 644 initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount); 645 } 646 else 647 { 648 const int deviceCount = getCudaEnabledDeviceCount(); 649 650 for (deviceId = 0; deviceId < deviceCount; ++deviceId) 651 { 652 setDevice(deviceId); 653 initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount); 654 } 655 } 656 657 setDevice(currentDevice); 658 #endif 659 } 660 661 #ifdef HAVE_CUDA 662 663 cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get()) 664 { 665 } 666 667 GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type) 668 { 669 GpuMat buf(allocator_); 670 buf.create(rows, cols, type); 671 return buf; 672 } 673 674 #endif 675 676 //////////////////////////////////////////////////////////////// 677 // Event 678 679 #ifndef HAVE_CUDA 680 681 class cv::cuda::Event::Impl 682 { 683 public: 684 Impl(unsigned int) 685 { 686 throw_no_cuda(); 687 } 688 }; 689 690 #else 691 692 class 
cv::cuda::Event::Impl 693 { 694 public: 695 cudaEvent_t event; 696 697 Impl(unsigned int flags); 698 ~Impl(); 699 }; 700 701 cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0) 702 { 703 cudaSafeCall( cudaEventCreateWithFlags(&event, flags) ); 704 } 705 706 cv::cuda::Event::Impl::~Impl() 707 { 708 if (event) 709 cudaEventDestroy(event); 710 } 711 712 cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event) 713 { 714 return event.impl_->event; 715 } 716 717 #endif 718 719 cv::cuda::Event::Event(CreateFlags flags) 720 { 721 #ifndef HAVE_CUDA 722 (void) flags; 723 throw_no_cuda(); 724 #else 725 impl_ = makePtr<Impl>(flags); 726 #endif 727 } 728 729 void cv::cuda::Event::record(Stream& stream) 730 { 731 #ifndef HAVE_CUDA 732 (void) stream; 733 throw_no_cuda(); 734 #else 735 cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) ); 736 #endif 737 } 738 739 bool cv::cuda::Event::queryIfComplete() const 740 { 741 #ifndef HAVE_CUDA 742 throw_no_cuda(); 743 return false; 744 #else 745 cudaError_t err = cudaEventQuery(impl_->event); 746 747 if (err == cudaErrorNotReady || err == cudaSuccess) 748 return err == cudaSuccess; 749 750 cudaSafeCall(err); 751 return false; 752 #endif 753 } 754 755 void cv::cuda::Event::waitForCompletion() 756 { 757 #ifndef HAVE_CUDA 758 throw_no_cuda(); 759 #else 760 cudaSafeCall( cudaEventSynchronize(impl_->event) ); 761 #endif 762 } 763 764 float cv::cuda::Event::elapsedTime(const Event& start, const Event& end) 765 { 766 #ifndef HAVE_CUDA 767 (void) start; 768 (void) end; 769 throw_no_cuda(); 770 return 0.0f; 771 #else 772 float ms; 773 cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) ); 774 return ms; 775 #endif 776 } 777