/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <stdio.h>
#include <cuda_runtime.h>

#include "opencv2/core/cuda/common.hpp"

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NCVPyramid.hpp"

#include "NCVAlg.hpp"
#include "NCVPixelOperations.hpp"

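// 2x2 box average of four neighbouring pixels, specialized on the channel
// count CN. The generic (integer) variants accumulate in Ncv32s and add a
// bias of 2 before dividing by 4, i.e. they round to nearest; the float and
// double specializations divide without a bias.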
template<typename T, Ncv32u CN> struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};

template<typename T> struct __average4_CN<T, 1> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float1, 1> {
static __host__ __device__ float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
{
    float1 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    return out;
}};

template<> struct __average4_CN<double1, 1> {
static __host__ __device__ double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
{
    double1 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    return out;
}};

template<typename T> struct __average4_CN<T, 3> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float3, 3> {
static __host__ __device__ float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
{
    float3 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    return out;
}};

template<> struct __average4_CN<double3, 3> {
static __host__ __device__ double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
{
    double3 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    return out;
}};

template<typename T> struct __average4_CN<T, 4> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
    out.w = ((Ncv32s)p00.w + p01.w + p10.w + p11.w + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float4, 4> {
static __host__ __device__ float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
{
    float4 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
    return out;
}};

template<> struct __average4_CN<double4, 4> {
static __host__ __device__ double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
{
    double4 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
    return out;
}};

template<typename T> static __host__ __device__ T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
{
    return __average4_CN<T, NC(T)>::_average4_CN(p00, p01, p10, p11);
}

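// Per-channel linear interpolation between pixels a and b with weight d:
// every output channel is b*d + a*(1-d), converted to the base type of the
// output vector type Tout. Specialized on the channel count of Tin.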
template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)));
}};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)),
                    TB(b.y * d + a.y * (1 - d)),
                    TB(b.z * d + a.z * (1 - d)));
}};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)),
                    TB(b.y * d + a.y * (1 - d)),
                    TB(b.z * d + a.z * (1 - d)),
                    TB(b.w * d + a.w * (1 - d)));
}};

template<typename Tin, typename Tout> static __host__ __device__ Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
{
    return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
}

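// Downsamples by a factor of two in each dimension: every thread inside
// dstRoi reads the corresponding 2x2 block from d_src (srcPitch is in bytes)
// and writes its _average4() result to d_dst.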
template<typename T>
__global__ void kernelDownsampleX2(T *d_src,
                                   Ncv32u srcPitch,
                                   T *d_dst,
                                   Ncv32u dstPitch,
                                   NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        T *d_src_line1 = (T *)((Ncv8u *)d_src + (2 * i + 0) * srcPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_src + (2 * i + 1) * srcPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        T p00 = d_src_line1[2*j+0];
        T p01 = d_src_line1[2*j+1];
        T p10 = d_src_line2[2*j+0];
        T p11 = d_src_line2[2*j+1];

        d_dst_line[j] = _average4(p00, p01, p10, p11);
    }
}

namespace cv { namespace cuda { namespace device
{
    namespace pyramid
    {
        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(src.cols, bDim.x), divUp(src.rows, bDim.y));

            kernelDownsampleX2<<<gDim, bDim, 0, stream>>>((T*)src.data, static_cast<Ncv32u>(src.step),
                (T*)dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

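        // Host-side dispatcher: picks the kernel instantiation from a table
        // indexed by OpenCV depth code (rows, CV_8U..CV_32F) and channel
        // count (columns, 1..4); null entries are combinations that are not
        // instantiated and fail the CV_Assert below.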
        void downsampleX2(PtrStepSzb src, PtrStepSzb dst, int depth, int cn, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[6][4] =
            {
                {kernelDownsampleX2_gpu<uchar1>       , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3>      , kernelDownsampleX2_gpu<uchar4>      },
                {0 /*kernelDownsampleX2_gpu<char1>*/  , 0 /*kernelDownsampleX2_gpu<char2>*/  , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
                {kernelDownsampleX2_gpu<ushort1>      , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3>     , kernelDownsampleX2_gpu<ushort4>     },
                {0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
                {0 /*kernelDownsampleX2_gpu<int1>*/   , 0 /*kernelDownsampleX2_gpu<int2>*/   , 0 /*kernelDownsampleX2_gpu<int3>*/  , 0 /*kernelDownsampleX2_gpu<int4>*/  },
                {kernelDownsampleX2_gpu<float1>       , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3>      , kernelDownsampleX2_gpu<float4>      }
            };

            const func_t func = funcs[depth][cn - 1];
            CV_Assert(func != 0);

            func(src, dst, stream);
        }
    }
}}}




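// Resamples d_srcTop (of size szTopRoi) to dstRoi with bilinear filtering.
// Each output pixel is mapped back to fractional source coordinates; the four
// neighbours are fetched with clamping at the right/bottom borders, blended
// with _lerp(), and demoted back to T via _pixDemoteClampZ().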
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
                                       Ncv32u srcTopPitch,
                                       NcvSize32u szTopRoi,
                                       T *d_dst,
                                       Ncv32u dstPitch,
                                       NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        Ncv32f ptTopX = 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1);
        Ncv32f ptTopY = 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1);
        Ncv32u xl = (Ncv32u)ptTopX;
        Ncv32u xh = xl+1;
        Ncv32f dx = ptTopX - xl;
        Ncv32u yl = (Ncv32u)ptTopY;
        Ncv32u yh = yl+1;
        Ncv32f dy = ptTopY - yl;

        T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        T p00, p01, p10, p11;
        p00 = d_src_line1[xl];
        p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
        p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
        p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;
        typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
        TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
        TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
        TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
        T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);

        d_dst_line[j] = outPix;
    }
}

namespace cv { namespace cuda { namespace device
{
    namespace pyramid
    {
        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(dst.cols, bDim.x), divUp(dst.rows, bDim.y));

            kernelInterpolateFrom1<<<gDim, bDim, 0, stream>>>((T*) src.data, static_cast<Ncv32u>(src.step), NcvSize32u(src.cols, src.rows),
                (T*) dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

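        // Host-side dispatcher for kernelInterpolateFrom1, indexed the same
        // way as downsampleX2 above (OpenCV depth code by row, channel count
        // by column).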
        void interpolateFrom1(PtrStepSzb src, PtrStepSzb dst, int depth, int cn, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[6][4] =
            {
                {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
                {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
                {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
                {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
                {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
                {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
            };

            const func_t func = funcs[depth][cn - 1];
            CV_Assert(func != 0);

            func(src, dst, stream);
        }
    }
}}}
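
// A minimal usage sketch of the two entry points above (sizes and the CV_8UC4
// type are illustrative; cv::cuda::GpuMat converts implicitly to PtrStepSzb):
//
//   cv::cuda::GpuMat src(480, 640, CV_8UC4), half(240, 320, CV_8UC4);
//   cv::cuda::device::pyramid::downsampleX2(src, half, src.depth(), src.channels(), 0);
//   cv::cuda::device::pyramid::interpolateFrom1(half, src, half.depth(), half.channels(), 0);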


#if 0 //def _WIN32

template<typename T>
static T _interpLinear(const T &a, const T &b, Ncv32f d)
{
    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
    TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
    return _pixDemoteClampZ<TVFlt, T>(tmp);
}

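// Host-side bilinear sample of refLayer at fractional coordinates (x, y),
// clamping the right/bottom neighbours at the border; mirrors the arithmetic
// of kernelInterpolateFrom1 above.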
template<typename T>
static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
{
    Ncv32u xl = (Ncv32u)x;
    Ncv32u xh = xl+1;
    Ncv32f dx = x - xl;
    Ncv32u yl = (Ncv32u)y;
    Ncv32u yh = yl+1;
    Ncv32f dy = y - yl;
    T p00, p01, p10, p11;
    p00 = refLayer.at(xl, yl);
    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
    p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00;
    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
    return _pixDemoteClampZ<TVFlt, T>(mixture);
}

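// Builds the pyramid by repeated 2x downsampling of img until either
// numLayers levels exist or a dimension would reach zero. Layers are filled
// with kernelDownsampleX2 when the allocator is a device allocator, otherwise
// on the host with _average4().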
template <class T>
NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
                                    Ncv8u numLayers,
                                    INCVMemAllocator &alloc,
                                    cudaStream_t cuStream)
{
    this->_isInitialized = false;
    ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid::ctor error", );

    this->layer0 = &img;
    NcvSize32u szLastLayer(img.width(), img.height());
    this->nLayers = 1;

    NCV_SET_SKIP_COND(alloc.isCounting());
    NcvBool bDeviceCode = alloc.memType() == NCVMemoryTypeDevice;

    if (numLayers == 0)
    {
        numLayers = 255; //the loop below cuts off once either dimension reaches 1
    }

#ifdef SELF_CHECK_GPU
    NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif

    for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
    {
        NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
        if (szCurLayer.width == 0 || szCurLayer.height == 0)
        {
            break;
        }

        this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
        ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid::ctor error", );
        this->nLayers++;

        //fill in the layer
        NCV_SKIP_COND_BEGIN

        const NCVMatrix<T> *prevLayer = i == 0 ? this->layer0 : this->pyramid[i-1];
        NCVMatrix<T> *curLayer = this->pyramid[i];

        if (bDeviceCode)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(szCurLayer.width, bDim.x), divUp(szCurLayer.height, bDim.y));
            kernelDownsampleX2<<<gDim, bDim, 0, cuStream>>>(prevLayer->ptr(),
                                                            prevLayer->pitch(),
                                                            curLayer->ptr(),
                                                            curLayer->pitch(),
                                                            szCurLayer);
            ncvAssertPrintReturn(cudaSuccess == cudaGetLastError(), "NCVImagePyramid::ctor error", );

#ifdef SELF_CHECK_GPU
            NCVMatrixAlloc<T> h_prevLayer(allocCPU, prevLayer->width(), prevLayer->height());
            ncvAssertPrintReturn(h_prevLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
            NCVMatrixAlloc<T> h_curLayer(allocCPU, curLayer->width(), curLayer->height());
            ncvAssertPrintReturn(h_curLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(NCV_SUCCESS == prevLayer->copy2D(h_prevLayer, prevLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(NCV_SUCCESS == curLayer->copy2D(h_curLayer, curLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(cudaSuccess == cudaStreamSynchronize(cuStream), "Validation failure in NCVImagePyramid::ctor", );
            for (Ncv32u i=0; i<szCurLayer.height; i++)
            {
                for (Ncv32u j=0; j<szCurLayer.width; j++)
                {
                    T p00 = h_prevLayer.at(2*j+0, 2*i+0);
                    T p01 = h_prevLayer.at(2*j+1, 2*i+0);
                    T p10 = h_prevLayer.at(2*j+0, 2*i+1);
                    T p11 = h_prevLayer.at(2*j+1, 2*i+1);
                    T outGold = _average4(p00, p01, p10, p11);
                    T outGPU = h_curLayer.at(j, i);
                    ncvAssertPrintReturn(0 == memcmp(&outGold, &outGPU, sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelDownsampleX2", );
                }
            }
#endif
        }
        else
        {
            for (Ncv32u i=0; i<szCurLayer.height; i++)
            {
                for (Ncv32u j=0; j<szCurLayer.width; j++)
                {
                    T p00 = prevLayer->at(2*j+0, 2*i+0);
                    T p01 = prevLayer->at(2*j+1, 2*i+0);
                    T p10 = prevLayer->at(2*j+0, 2*i+1);
                    T p11 = prevLayer->at(2*j+1, 2*i+1);
                    curLayer->at(j, i) = _average4(p00, p01, p10, p11);
                }
            }
        }

        NCV_SKIP_COND_END

        szLastLayer = szCurLayer;
    }

    this->_isInitialized = true;
}


template <class T>
NCVImagePyramid<T>::~NCVImagePyramid()
{
}


template <class T>
NcvBool NCVImagePyramid<T>::isInitialized() const
{
    return this->_isInitialized;
}

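// Resamples the pyramid to outRoi. A layer whose size matches outRoi exactly
// is copied; otherwise the reference layer found just above outRoi is
// resampled bilinearly (kernelInterpolateFrom1 on the device, _interpBilinear
// on the host). On the host path, bTrilinear additionally blends in the next
// coarser layer when outRoi falls strictly between two layer sizes.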
template <class T>
NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
                                       NcvSize32u outRoi,
                                       NcvBool bTrilinear,
                                       cudaStream_t cuStream) const
{
    ncvAssertReturn(this->isInitialized(), NCV_UNKNOWN_ERROR);
    ncvAssertReturn(outImg.memType() == this->layer0->memType(), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(outRoi.width <= this->layer0->width() && outRoi.height <= this->layer0->height() &&
                    outRoi.width > 0 && outRoi.height > 0, NCV_DIMENSIONS_INVALID);

    if (outRoi.width == this->layer0->width() && outRoi.height == this->layer0->height())
    {
        ncvAssertReturnNcvStat(this->layer0->copy2D(outImg, NcvSize32u(this->layer0->width(), this->layer0->height()), cuStream));
        return NCV_SUCCESS;
    }

    Ncv32f lastScale = 1.0f;
    Ncv32f curScale;
    const NCVMatrix<T> *lastLayer = this->layer0;
    const NCVMatrix<T> *curLayer = NULL;
    NcvBool bUse2Refs = false;

    for (Ncv32u i=0; i<this->nLayers-1; i++)
    {
        curScale = lastScale * 0.5f;
        curLayer = this->pyramid[i];

        if (outRoi.width == curLayer->width() && outRoi.height == curLayer->height())
        {
            ncvAssertReturnNcvStat(this->pyramid[i]->copy2D(outImg, NcvSize32u(this->pyramid[i]->width(), this->pyramid[i]->height()), cuStream));
            return NCV_SUCCESS;
        }

        if (outRoi.width >= curLayer->width() && outRoi.height >= curLayer->height())
        {
            if (outRoi.width < lastLayer->width() && outRoi.height < lastLayer->height())
            {
                bUse2Refs = true;
            }
            break;
        }

        lastScale = curScale;
        lastLayer = curLayer;
    }

    bUse2Refs = bUse2Refs && bTrilinear;

    NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
    NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;

#ifdef SELF_CHECK_GPU
    NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif

    NCV_SKIP_COND_BEGIN

    if (bDeviceCode)
    {
        ncvAssertReturn(bUse2Refs == false, NCV_NOT_IMPLEMENTED);

        dim3 bDim(16, 8);
        dim3 gDim(divUp(outRoi.width, bDim.x), divUp(outRoi.height, bDim.y));
        kernelInterpolateFrom1<<<gDim, bDim, 0, cuStream>>>(lastLayer->ptr(),
                                                            lastLayer->pitch(),
                                                            lastLayer->size(),
                                                            outImg.ptr(),
                                                            outImg.pitch(),
                                                            outRoi);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

#ifdef SELF_CHECK_GPU
        ncvSafeMatAlloc(h_lastLayer, T, allocCPU, lastLayer->width(), lastLayer->height(), NCV_ALLOCATOR_BAD_ALLOC);
        ncvSafeMatAlloc(h_outImg, T, allocCPU, outImg.width(), outImg.height(), NCV_ALLOCATOR_BAD_ALLOC);
        ncvAssertReturnNcvStat(lastLayer->copy2D(h_lastLayer, lastLayer->size(), cuStream));
        ncvAssertReturnNcvStat(outImg.copy2D(h_outImg, outRoi, cuStream));
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

        for (Ncv32u i=0; i<outRoi.height; i++)
        {
            for (Ncv32u j=0; j<outRoi.width; j++)
            {
                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
                T outGold = _interpBilinear(h_lastLayer, ptTopX, ptTopY);
                ncvAssertPrintReturn(0 == memcmp(&outGold, &h_outImg.at(j,i), sizeof(T)), "Validation failure in NCVImagePyramid::getLayer with kernelInterpolateFrom1", NCV_UNKNOWN_ERROR);
            }
        }
#endif
    }
    else
    {
        for (Ncv32u i=0; i<outRoi.height; i++)
        {
            for (Ncv32u j=0; j<outRoi.width; j++)
            {
                //top layer pixel (always exists)
                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
                T topPix = _interpBilinear(*lastLayer, ptTopX, ptTopY);
                T trilinearPix = topPix;

                if (bUse2Refs)
                {
                    //bottom layer pixel (exists only if the requested scale is greater than the smallest layer scale)
                    NcvSize32u szBottomLayer(curLayer->width(), curLayer->height());
                    Ncv32f ptBottomX = 1.0f * (szBottomLayer.width - 1) * j / (outRoi.width - 1);
                    Ncv32f ptBottomY = 1.0f * (szBottomLayer.height - 1) * i / (outRoi.height - 1);
                    T bottomPix = _interpBilinear(*curLayer, ptBottomX, ptBottomY);

                    Ncv32f scale = (1.0f * outRoi.width / layer0->width() + 1.0f * outRoi.height / layer0->height()) / 2;
                    Ncv32f dl = (scale - curScale) / (lastScale - curScale);
                    dl = CLAMP(dl, 0.0f, 1.0f);
                    trilinearPix = _interpLinear(bottomPix, topPix, dl);
                }

                outImg.at(j, i) = trilinearPix;
            }
        }
    }

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}


template class NCVImagePyramid<uchar1>;
template class NCVImagePyramid<uchar3>;
template class NCVImagePyramid<uchar4>;
template class NCVImagePyramid<ushort1>;
template class NCVImagePyramid<ushort3>;
template class NCVImagePyramid<ushort4>;
template class NCVImagePyramid<uint1>;
template class NCVImagePyramid<uint3>;
template class NCVImagePyramid<uint4>;
template class NCVImagePyramid<float1>;
template class NCVImagePyramid<float3>;
template class NCVImagePyramid<float4>;

#endif //_WIN32