/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::cuda;

/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool;

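    // MemoryStack is a simple bump (stack) allocator over a fixed block of
    // device memory: requestMemory() advances the tip, returnMemory() rewinds
    // it. Allocations must therefore be released in strict LIFO order; debug
    // builds verify this via the `allocations` bookkeeping below.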
    class MemoryStack
    {
    public:
        uchar* requestMemory(size_t size);
        void returnMemory(uchar* ptr);

        uchar* datastart;
        uchar* dataend;
        uchar* tip;

        bool isFree;
        MemoryPool* pool;

    #if !defined(NDEBUG)
        std::vector<size_t> allocations;
    #endif
    };

    uchar* MemoryStack::requestMemory(size_t size)
    {
        const size_t freeMem = dataend - tip;

        if (size > freeMem)
            return 0;

        uchar* ptr = tip;

        tip += size;

    #if !defined(NDEBUG)
        allocations.push_back(size);
    #endif

        return ptr;
    }

    void MemoryStack::returnMemory(uchar* ptr)
    {
        CV_DbgAssert( ptr >= datastart && ptr < dataend );

    #if !defined(NDEBUG)
        const size_t allocSize = tip - ptr;
        CV_Assert( allocSize == allocations.back() );
        allocations.pop_back();
    #endif

        tip = ptr;
    }
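
    // Illustrative sketch of the LIFO contract (not compiled; assumes a stack
    // already set up by MemoryPool below):
    //
    //   uchar* a = stack.requestMemory(256);
    //   uchar* b = stack.requestMemory(512);
    //   stack.returnMemory(b);   // the newest allocation comes back first
    //   stack.returnMemory(a);   // then the older one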
}

#endif

/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
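    // MemoryPool carves one large cudaMalloc allocation into `stackCount_`
    // equally sized MemoryStack instances and hands them out, one per stream.
    // All public methods are guarded by `mtx_`, so stacks can be requested
    // and returned from multiple host threads.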
    class MemoryPool
    {
    public:
        MemoryPool();

        void initialize(size_t stackSize, int stackCount);
        void release();

        MemoryStack* getFreeMemStack();
        void returnMemStack(MemoryStack* memStack);

    private:
        void initializeImpl();

        Mutex mtx_;

        bool initialized_;
        size_t stackSize_;
        int stackCount_;

        uchar* mem_;

        std::vector<MemoryStack> stacks_;
    };

    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
    {
        // default: 10 MB per stack, 5 stacks
        stackSize_ = 10 * 1024 * 1024;
        stackCount_ = 5;
    }

    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);

        release();

        stackSize_ = stackSize;
        stackCount_ = stackCount;

        initializeImpl();
    }

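    // Reserves stackSize_ * stackCount_ bytes of device memory in a single
    // cudaMalloc call and slices it into equal stacks. With the defaults this
    // is 10 MB * 5 = 50 MB per device, allocated lazily on first use.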
    void MemoryPool::initializeImpl()
    {
        const size_t totalSize = stackSize_ * stackCount_;

        if (totalSize > 0)
        {
            cudaError_t err = cudaMalloc(&mem_, totalSize);
            if (err != cudaSuccess)
                return;

            stacks_.resize(stackCount_);

            uchar* ptr = mem_;

            for (int i = 0; i < stackCount_; ++i)
            {
                stacks_[i].datastart = ptr;
                stacks_[i].dataend = ptr + stackSize_;
                stacks_[i].tip = ptr;
                stacks_[i].isFree = true;
                stacks_[i].pool = this;

                ptr += stackSize_;
            }

            initialized_ = true;
        }
    }

    void MemoryPool::release()
    {
        if (mem_)
        {
#if !defined(NDEBUG)
            for (int i = 0; i < stackCount_; ++i)
            {
                CV_DbgAssert( stacks_[i].isFree );
                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
            }
#endif

            cudaFree(mem_);

            mem_ = 0;
            initialized_ = false;
        }
    }

    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);

        if (!initialized_)
            initializeImpl();

        if (!mem_)
            return 0;

        for (int i = 0; i < stackCount_; ++i)
        {
            if (stacks_[i].isFree)
            {
                stacks_[i].isFree = false;
                return &stacks_[i];
            }
        }

        return 0;
    }

    void MemoryPool::returnMemStack(MemoryStack* memStack)
    {
        AutoLock lock(mtx_);

        CV_DbgAssert( !memStack->isFree );

#if !defined(NDEBUG)
        bool found = false;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (memStack == &stacks_[i])
            {
                found = true;
                break;
            }
        }
        CV_DbgAssert( found );
#endif

        CV_DbgAssert( memStack->tip == memStack->datastart );

        memStack->isFree = true;
    }
}

#endif

////////////////////////////////////////////////////////////////
/// Stream::Impl

#ifndef HAVE_CUDA

class cv::cuda::Stream::Impl
{
public:
    Impl(void* ptr = 0)
    {
        (void) ptr;
        throw_no_cuda();
    }
};

#else

namespace
{
    class StackAllocator;
}

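// Each Stream::Impl owns the underlying cudaStream_t together with a
// per-stream StackAllocator, so GpuMat buffers obtained through BufferPool
// are tied to the stream's lifetime. The cudaStream_t constructor wraps an
// already existing stream handle; this is how the NULL stream below is built.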
class cv::cuda::Stream::Impl
{
public:
    cudaStream_t stream;
    Ptr<StackAllocator> stackAllocator_;

    Impl();
    Impl(cudaStream_t stream);

    ~Impl();
};

cv::cuda::Stream::Impl::Impl() : stream(0)
{
    cudaSafeCall( cudaStreamCreate(&stream) );

    stackAllocator_ = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
{
    stackAllocator_ = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::~Impl()
{
    stackAllocator_.release();

    if (stream)
        cudaStreamDestroy(stream);
}

#endif

/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

namespace cv { namespace cuda
{
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        Stream& getNullStream(int deviceId);
        MemoryPool* getMemoryPool(int deviceId);

    private:
        void initStreams();
        void initPools();

        std::vector<Ptr<Stream> > streams_;
        Mutex streams_mtx_;

        std::vector<MemoryPool> pools_;
        Mutex pools_mtx_;
    };

    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }

    DefaultDeviceInitializer::~DefaultDeviceInitializer()
    {
        streams_.clear();

        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(static_cast<int>(i));
            pools_[i].release();
        }

        pools_.clear();
    }

    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
    {
        AutoLock lock(streams_mtx_);

        if (streams_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                streams_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

        if (streams_[deviceId].empty())
        {
            cudaStream_t stream = NULL;
            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
        }

        return *streams_[deviceId];
    }
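
    // getNullStream wraps cudaStream_t NULL (the legacy default stream), so
    // work enqueued through Stream::Null() serializes with the CUDA default
    // stream. Both streams_ and pools_ are filled in lazily, one entry per
    // CUDA device.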

    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
    {
        AutoLock lock(pools_mtx_);

        if (pools_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                pools_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

        return &pools_[deviceId];
    }

    DefaultDeviceInitializer initializer;
}}

#endif

/////////////////////////////////////////////////////////////
/// Stream

cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}

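// Non-blocking completion check: cudaSuccess means all work submitted to the
// stream has finished, cudaErrorNotReady simply means "not yet" and is not
// treated as an error. Any other status is raised through cudaSafeCall.
//
// Typical polling loop (illustrative only):
//
//   while (!stream.queryIfComplete())
//       doSomeCpuWork();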
bool cv::cuda::Stream::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaStreamQuery(impl_->stream);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}

void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    (void) event;
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}

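// Host-callback plumbing for CUDA >= 5.0. A CallbackData is heap-allocated
// per call and deleted by the trampoline after the user callback returns, so
// each callback fires exactly once. Note that CUDA forbids making CUDA API
// calls from inside a stream callback.
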
#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}

#endif

void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    (void) callback;
    (void) userData;
    throw_no_cuda();
#else
    #if CUDART_VERSION < 5000
        (void) callback;
        (void) userData;
        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
    #else
        CallbackData* data = new CallbackData(callback, userData);

        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
    #endif
#endif
}
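
// Usage sketch (illustrative only; onStreamDone is a hypothetical user
// function): the callback runs on a CUDA-managed host thread after all work
// previously enqueued on the stream has completed.
//
//   void onStreamDone(int status, void* userData)
//   {
//       // `status` is the stream's cudaError_t, cast to int
//   }
//
//   stream.enqueueHostCallback(onStreamDone, 0);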

Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    static Stream stream;
    return stream;
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}

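// Safe-bool idiom (pre-C++11): returning a pointer-to-member lets a Stream be
// tested in boolean contexts (e.g. `if (stream)`) without permitting
// accidental implicit conversions to int. A wrapped NULL stream evaluates to
// false.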
cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}

#ifdef HAVE_CUDA

cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

#endif

/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
    bool enableMemoryPool = true;

    class StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();

        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);

    private:
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);

        cudaStream_t stream_;
        MemoryStack* memStack_;
        size_t alignment_;
    };

    StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0), alignment_(1)
    {
        if (enableMemoryPool)
        {
            const int deviceId = getDevice();
            memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
            DeviceInfo devInfo(deviceId);
            alignment_ = devInfo.textureAlignment();
        }
    }

    StackAllocator::~StackAllocator()
    {
        cudaStreamSynchronize(stream_);

        if (memStack_ != 0)
            memStack_->pool->returnMemStack(memStack_);
    }

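    // Rounds `what` up to the next multiple of `alignment`; `alignment` must
    // be a power of two for the mask trick below to work. For example,
    // alignUp(100, 64) == 128.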
    size_t alignUp(size_t what, size_t alignment)
    {
        size_t alignMask = alignment-1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }

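    // For genuinely 2D matrices each row starts at a texture-aligned offset
    // (pitched layout); a single row or column is kept dense so the GpuMat
    // stays continuous. As an illustration, assuming a device with 512-byte
    // texture alignment, elemSize = 3 and cols = 100 give
    // pitch = alignUp(300, 512) = 512 bytes per row.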
    bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (memStack_ == 0)
            return false;

        size_t pitch, memSize;

        if (rows > 1 && cols > 1)
        {
            pitch = alignUp(cols * elemSize, alignment_);
            memSize = pitch * rows;
        }
        else
        {
            // Single row or single column must be continuous
            pitch = elemSize * cols;
            memSize = alignUp(elemSize * cols * rows, 64);
        }

        uchar* ptr = memStack_->requestMemory(memSize);

        if (ptr == 0)
            return false;

        mat->data = ptr;
        mat->step = pitch;
        mat->refcount = (int*) fastMalloc(sizeof(int));

        return true;
    }

    void StackAllocator::free(GpuMat* mat)
    {
        if (memStack_ == 0)
            return;

        memStack_->returnMemory(mat->datastart);
        fastFree(mat->refcount);
    }
}

#endif

/////////////////////////////////////////////////////////////
/// BufferPool

void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    (void)on;
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}

void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
#ifndef HAVE_CUDA
    (void)deviceId;
    (void)stackSize;
    (void)stackCount;
    throw_no_cuda();
#else
    const int currentDevice = getDevice();

    if (deviceId >= 0)
    {
        setDevice(deviceId);
        initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
    }
    else
    {
        const int deviceCount = getCudaEnabledDeviceCount();

        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
        {
            setDevice(deviceId);
            initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
        }
    }

    setDevice(currentDevice);
#endif
}
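
// Configuration sketch (illustrative only): a negative deviceId reconfigures
// the pool on every CUDA device. Call this before creating the streams that
// should use the new geometry.
//
//   cv::cuda::setBufferPoolUsage(true);
//   cv::cuda::setBufferPoolConfig(-1, 64 * 1024 * 1024, 2);  // 2 x 64 MB stacks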

#ifdef HAVE_CUDA

cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}

GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}

#endif
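
// Usage sketch (illustrative only): buffers come from the stream's stack
// allocator, so they must be released in reverse order of acquisition and
// must not outlive the stream.
//
//   cv::cuda::Stream stream;
//   cv::cuda::BufferPool pool(stream);
//   cv::cuda::GpuMat buf1 = pool.getBuffer(1024, 1024, CV_8UC1);
//   cv::cuda::GpuMat buf2 = pool.getBuffer(1024, 1024, CV_32FC1);
//   // ... use buf1/buf2 with `stream` ...
//   // at scope exit buf2 is destroyed (and returned) before buf1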

////////////////////////////////////////////////////////////////
// Event

#ifndef HAVE_CUDA

class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};

#else

class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event;

    Impl(unsigned int flags);
    ~Impl();
};

cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
}

cv::cuda::Event::Impl::~Impl()
{
    if (event)
        cudaEventDestroy(event);
}

cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}

#endif

cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    (void) flags;
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}

void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    (void) stream;
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}

bool cv::cuda::Event::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaEventQuery(impl_->event);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}

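// Timing sketch (illustrative only): both events must have been recorded and
// created with timing enabled (i.e. without the DISABLE_TIMING flag).
//
//   cv::cuda::Event start, stop;
//   start.record(stream);
//   // ... enqueue GPU work on `stream` ...
//   stop.record(stream);
//   stop.waitForCompletion();
//   float ms = cv::cuda::Event::elapsedTime(start, stop);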
float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    (void) start;
    (void) end;
    throw_no_cuda();
    return 0.0f;
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}
    777