/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__

#include "../../common.hpp"
#include "../../warp/shuffle.hpp"
#include "../../block/scan.hpp"
#include "../../ptr2d/glob.hpp"

namespace cv { namespace cudev {

namespace integral_detail
{
    // horizontal_pass

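    // The horizontal pass computes an inclusive prefix sum along each row:
    // after this pass, dst(y, x) = src(y, 0) + src(y, 1) + ... + src(y, x).
    // One thread block scans one row, NUM_SCAN_THREADS elements at a time,
    // carrying the running total between chunks.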
    template <int NUM_SCAN_THREADS, class SrcPtr, typename D>
    __global__ void horizontal_pass(const SrcPtr src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        carryElem = 0;

        __syncthreads();

        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src(blockIdx.x, curElemOffs);

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <int NUM_SCAN_THREADS, typename T, typename D>
    __global__ void horizontal_pass(const GlobPtr<T> src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        carryElem = 0;

        __syncthreads();

        const T* src_row = src.row(blockIdx.x);
        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src_row[curElemOffs];

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <class SrcPtr, typename D>
    __host__ void horizontal_pass(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
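        // Launch configuration: one 256-thread block per image row, so the grid
        // has exactly rows blocks and each block scans its row independently in
        // 256-element chunks.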
        const int NUM_SCAN_THREADS = 256;

        const dim3 block(NUM_SCAN_THREADS);
        const dim3 grid(rows);

        horizontal_pass<NUM_SCAN_THREADS><<<grid, block, 0, stream>>>(src, dst, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // horisontal_pass_8u_shfl

    __device__ static uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >>  0;
        bytes.y = (in & 0x0000ff00) >>  8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }
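    // For example, int_to_uchar4(0x04030201) returns (x, y, z, w) = (1, 2, 3, 4):
    // the least significant byte of the packed word becomes .x, which matches the
    // little-endian layout of four consecutive uchar pixels read as one uint.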

    __global__ static void horisontal_pass_8u_shfl_kernel(const GlobPtr<uint4> img, GlobPtr<uint4> integral)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ int sums[128];

        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);

        int result[16];

        result[0]  =              a.x;
        result[1]  = result[0]  + a.y;
        result[2]  = result[1]  + a.z;
        result[3]  = result[2]  + a.w;

        result[4]  = result[3]  + b.x;
        result[5]  = result[4]  + b.y;
        result[6]  = result[5]  + b.z;
        result[7]  = result[6]  + b.w;

        result[8]  = result[7]  + c.x;
        result[9]  = result[8]  + c.y;
        result[10] = result[9]  + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];

        // The prefix sum over each thread's 16 values is now computed; the final
        // per-thread totals (result[15]) must be shared with the other threads in
        // the warp and added in. To do this, the shfl_up() instruction is used to
        // perform a shuffle scan that distributes the partial sums to the correct
        // threads.
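        // For instance, if the per-thread totals across the warp are t0..t31,
        // then after the i = 1 step lane L holds t[L-1] + t[L], after i = 2 it
        // holds t[L-3] + ... + t[L], and after the final i = 16 step each lane L
        // holds the inclusive prefix t[0] + ... + t[L] of the warp's totals.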
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
                #pragma unroll
                for (int k = 0; k < 16; ++k)
                    result[k] += n;

                sum += n;
            }
        }

        // Now the final sum for each warp must be shared between warps. Each
        // warp's last thread stores its warp total to shared memory; warp 0 then
        // loads those totals and computes a prefix sum over them, again using
        // shfl_up, and the results are uniformly added back to the other warps.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            #pragma unroll
            for (int i = 1; i <= 32; i *= 2)
            {
                const int n = shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // fold in the total of all preceding warps
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            #pragma unroll
            for (int k = 0; k < 16; ++k)
                result[k] += blockSum;
        }

        // assemble result
        // Each thread has 16 values to write, which are now integer data (to
        // avoid overflow). Instead of each thread writing consecutive uint4s,
        // the code below uses the shuffle instruction to reformat the data
        // inside the registers so that each thread holds consecutive data to be
        // written, allowing larger contiguous segments to be assembled for the
        // writes.

        /*
            For example, data that needs to be written as

            GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3

            is stored in registers (r0..r3), in four threads (0..3), as:

            threadId   0  1  2  3
              r0      x0 y0 z0 w0
              r1      x1 y1 z1 w1
              r2      x2 y2 z2 w2
              r3      x3 y3 z3 w3

            After applying shfl_xor operations to move data between registers r1..r3:

            threadId  00 01 10 11
                      x0 y0 z0 w0
             xor(01)->y1 x1 w1 z1
             xor(10)->z2 w2 x2 y2
             xor(11)->w3 z3 y3 x3

             now x0..x3 and z0..z3 can be written out in order by all threads.

             In the current code, each register above actually represents
             four integers to be written as uint4's to GMEM.
        */

        result[4]  = shfl_xor(result[4] , 1, 32);
        result[5]  = shfl_xor(result[5] , 1, 32);
        result[6]  = shfl_xor(result[6] , 1, 32);
        result[7]  = shfl_xor(result[7] , 1, 32);

        result[8]  = shfl_xor(result[8] , 2, 32);
        result[9]  = shfl_xor(result[9] , 2, 32);
        result[10] = shfl_xor(result[10], 2, 32);
        result[11] = shfl_xor(result[11], 2, 32);

        result[12] = shfl_xor(result[12], 3, 32);
        result[13] = shfl_xor(result[13], 3, 32);
        result[14] = shfl_xor(result[14], 3, 32);
        result[15] = shfl_xor(result[15], 3, 32);

        uint4* integral_row = integral.row(blockIdx.x);
        uint4 output;

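        // Within each group of four consecutive threads (same threadIdx.x / 4),
        // the four threads together write four consecutive uint4 slots of the
        // output row in every store statement below: the xor-rotation above
        // rearranged the registers so that each store issues one contiguous,
        // coalesced transaction per group.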
        ///////

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;

        // Continuing from the example above, this use of shfl_xor() places the
        // y0..y3 and w0..w3 data in order.

        #pragma unroll
        for (int i = 0; i < 16; ++i)
            result[i] = shfl_xor(result[i], 1, 32);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }

    __host__ static void horisontal_pass_8u_shfl(const GlobPtr<uchar> src, GlobPtr<uint> integral, int rows, int cols, cudaStream_t stream)
    {
        // each thread handles 16 values, use 1 block / row
        // safe, because the step actually can't be less than 512 bytes
        const int block = cols / 16;

        // launch 1 block / row
        const int grid = rows;

        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(horisontal_pass_8u_shfl_kernel, cudaFuncCachePreferL1) );

        GlobPtr<uint4> src4 = globPtr((uint4*) src.data, src.step);
        GlobPtr<uint4> integral4 = globPtr((uint4*) integral.data, integral.step);
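        // Reinterpreting the uchar source rows as uint4 lets each thread load 16
        // consecutive 8-bit pixels in a single 128-bit transaction; the integral
        // rows are likewise written back as uint4.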

        horisontal_pass_8u_shfl_kernel<<<grid, block, 0, stream>>>(src4, integral4);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // vertical

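    // The vertical pass runs in place on the output of a horizontal pass and
    // accumulates each column downwards, so that afterwards
    // integral(y, x) = sum of src(i, j) over all i <= y, j <= x.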
    template <typename T>
    __global__ void vertical_pass(GlobPtr<T> integral, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ T sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        T stepSum = 0;

        int numBuckets = divUp(rows, blockDim.y);
        int y = threadIdx.y;

        while (numBuckets--)
        {
            T* p = integral.row(y) + tidx;

            T sum = (tidx < cols) && (y < rows) ? *p : 0;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // The tile is now in shared memory; remap the thread indices so that
            // the shfl_up scan below computes a prefix sum down each column of
            // the 32x8 tile inside a warp, then write the column prefixes back
            // to shared memory to be read out in the original layout.
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            T partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                T n = shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            if ((tidx < cols) && (y < rows))
            {
                *p = sum;
            }

            y += blockDim.y;
        }
    #else
        __shared__ T smem[32][32];
        __shared__ T prevVals[32];

        volatile T* smem_row = &smem[0][0] + 64 * threadIdx.y;

        if (threadIdx.y == 0)
            prevVals[threadIdx.x] = 0;

        __syncthreads();

        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        int numBuckets = divUp(rows, 8 * 4);
        int offsetY = 0;

        while (numBuckets--)
        {
            const int curRowOffs = offsetY + threadIdx.y;

            T curElems[4];
            T temp[4];

            // load patch

            smem[threadIdx.y +  0][threadIdx.x] = 0.0f;
            smem[threadIdx.y +  8][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 16][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 24][threadIdx.x] = 0.0f;

            if (x < cols)
            {
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        smem[threadIdx.y + i * 8][threadIdx.x] = integral(curRowOffs + i * 8, x);
                }
            }

            __syncthreads();

            // reduce

            curElems[0] = smem[threadIdx.x][threadIdx.y     ];
            curElems[1] = smem[threadIdx.x][threadIdx.y +  8];
            curElems[2] = smem[threadIdx.x][threadIdx.y + 16];
            curElems[3] = smem[threadIdx.x][threadIdx.y + 24];

            __syncthreads();

            temp[0] = curElems[0] = warpScanInclusive(curElems[0], smem_row, threadIdx.x);
            temp[1] = curElems[1] = warpScanInclusive(curElems[1], smem_row, threadIdx.x);
            temp[2] = curElems[2] = warpScanInclusive(curElems[2], smem_row, threadIdx.x);
            temp[3] = curElems[3] = warpScanInclusive(curElems[3], smem_row, threadIdx.x);

            curElems[0] += prevVals[threadIdx.y     ];
            curElems[1] += prevVals[threadIdx.y +  8];
            curElems[2] += prevVals[threadIdx.y + 16];
            curElems[3] += prevVals[threadIdx.y + 24];

            __syncthreads();

            if (threadIdx.x == 31)
            {
                prevVals[threadIdx.y     ] += temp[0];
                prevVals[threadIdx.y +  8] += temp[1];
                prevVals[threadIdx.y + 16] += temp[2];
                prevVals[threadIdx.y + 24] += temp[3];
            }

            smem[threadIdx.y     ][threadIdx.x] = curElems[0];
            smem[threadIdx.y +  8][threadIdx.x] = curElems[1];
            smem[threadIdx.y + 16][threadIdx.x] = curElems[2];
            smem[threadIdx.y + 24][threadIdx.x] = curElems[3];

            __syncthreads();

            // store patch

            if (x < cols)
            {
                // write the 4 scanned values back in place
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        integral(curRowOffs + i * 8, x) = smem[threadIdx.x][threadIdx.y + i * 8];
                }
            }

            __syncthreads();

            offsetY += 8 * 4;
        }
    #endif
    }

    template <typename T>
    __host__ void vertical_pass(const GlobPtr<T>& integral, int rows, int cols, cudaStream_t stream)
    {
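        // 32x8 thread blocks; the grid spans only the columns (divUp(cols, 32)
        // blocks), and each block walks down the full image height over its 32
        // columns, carrying the running column totals between tiles.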
        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x));

        vertical_pass<<<grid, block, 0, stream>>>(integral, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // integral

    template <class SrcPtr, typename D>
    __host__ void integral(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        horizontal_pass(src, dst, rows, cols, stream);
        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 16 == 0)
            && reinterpret_cast<intptr_t>(src.data) % 32 == 0
            && reinterpret_cast<intptr_t>(dst.data) % 32 == 0)
        {
            horisontal_pass_8u_shfl(src, dst, rows, cols, stream);
        }
        else
        {
            horizontal_pass(src, dst, rows, cols, stream);
        }

        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);
    }
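
    // A minimal usage sketch (the buffer names d_src, d_sum and the byte steps
    // are hypothetical; in OpenCV these would normally come from GpuMat):
    //
    //   GlobPtr<uchar> src = globPtr(d_src, src_step_in_bytes);
    //   GlobPtr<int>   dst = globPtr(d_sum, dst_step_in_bytes);
    //   integral_detail::integral(src, dst, rows, cols, 0); // 0 = default stream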
}

}}

#endif