/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_CUDA_EMULATION_HPP_
#define OPENCV_CUDA_EMULATION_HPP_

#include "common.hpp"
#include "warp_reduce.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

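// Illustrative usage sketch (not part of the original header): a reduction
// kernel might route its atomics through these wrappers so the same source
// also builds for older architectures. The kernel and buffer names below are
// hypothetical.
//
//     __global__ void sumKernel(const float* src, int n, float* sum)
//     {
//         const int idx = blockIdx.x * blockDim.x + threadIdx.x;
//         if (idx < n)
//             cv::cuda::device::Emulation::glob::atomicAdd(sum, src[idx]);
//     }
//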
namespace cv { namespace cuda { namespace device
{
    struct Emulation
    {

        static __device__ __forceinline__ int syncthreadsOr(int pred)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
                // just a compilation stub
                return 0;
#else
                return __syncthreads_or(pred);
#endif
        }

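        // Warp-vote emulation: on sm_2x and newer this maps directly to __ballot().
        // The fallback has each thread publish its lane bit (or 0) in shared memory
        // and folds them together with warp_reduce(); the bits are disjoint within a
        // warp, so the reduction yields the same mask a real ballot would produce.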
        template<int CTA_SIZE>
        static __forceinline__ __device__ int Ballot(int predicate)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
            __shared__ volatile int cta_buffer[CTA_SIZE];

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
            return warp_reduce(cta_buffer);
#endif
        }

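        // Shared-memory atomics for devices without native support (sm_1x).
        // Each thread stamps the upper 5 bits of the word with a tag derived from
        // its thread index and keeps rewriting until its own tagged value is the
        // one that survives, which serializes conflicting updates. TAG_MASK
        // selects the lower, untagged bits that carry the actual counter value.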
        struct smem
        {
            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };

            template<typename T>
            static __device__ __forceinline__ T atomicInc(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
                do
                {
                    count = *address & TAG_MASK;
                    count = tag | (count + 1);
                    *address = count;
                } while (*address != count);

                return (count & TAG_MASK) - 1;
#else
                return ::atomicInc(address, val);
#endif
            }

            template<typename T>
            static __device__ __forceinline__ T atomicAdd(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
                do
                {
                    count = *address & TAG_MASK;
                    count = tag | (count + val);
                    *address = count;
                } while (*address != count);

                return (count & TAG_MASK) - val;
#else
                return ::atomicAdd(address, val);
#endif
            }

            template<typename T>
            static __device__ __forceinline__ T atomicMin(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count = ::min(*address, val);
                do
                {
                    *address = count;
                } while (*address > count);

                return count;
#else
                return ::atomicMin(address, val);
#endif
            }
        }; // struct smem

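        // Global-memory atomics. The integer overloads forward to the hardware
        // intrinsics; the float and double overloads fall back to a
        // compare-and-swap loop on architectures without native support.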
        struct glob
        {
            static __device__ __forceinline__ int atomicAdd(int* address, int val)
            {
                return ::atomicAdd(address, val);
            }
            static __device__ __forceinline__ unsigned int atomicAdd(unsigned int* address, unsigned int val)
            {
                return ::atomicAdd(address, val);
            }
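            // Pre-sm_20 devices have no native float atomicAdd, so the value is
            // reinterpreted as an int and updated through an atomicCAS loop that
            // retries until no other thread has modified the word in between.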
            static __device__ __forceinline__ float atomicAdd(float* address, float val)
            {
            #if __CUDA_ARCH__ >= 200
                return ::atomicAdd(address, val);
            #else
                int* address_as_i = (int*) address;
                int old = *address_as_i, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_i, assumed,
                        __float_as_int(val + __int_as_float(assumed)));
                } while (assumed != old);
                return __int_as_float(old);
            #endif
            }
            static __device__ __forceinline__ double atomicAdd(double* address, double val)
            {
            #if __CUDA_ARCH__ >= 130
                unsigned long long int* address_as_ull = (unsigned long long int*) address;
                unsigned long long int old = *address_as_ull, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
                } while (assumed != old);
                return __longlong_as_double(old);
            #else
                (void) address;
                (void) val;
                return 0.0;
            #endif
            }

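            // atomicMin/atomicMax have no floating-point intrinsics, so the float
            // and double overloads reinterpret the value's bits and loop on
            // atomicCAS with fminf/fmaxf (fmin/fmax) until the update lands.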
            static __device__ __forceinline__ int atomicMin(int* address, int val)
            {
                return ::atomicMin(address, val);
            }
            static __device__ __forceinline__ float atomicMin(float* address, float val)
            {
            #if __CUDA_ARCH__ >= 120
                int* address_as_i = (int*) address;
                int old = *address_as_i, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_i, assumed,
                        __float_as_int(::fminf(val, __int_as_float(assumed))));
                } while (assumed != old);
                return __int_as_float(old);
            #else
                (void) address;
                (void) val;
                return 0.0f;
            #endif
            }
            static __device__ __forceinline__ double atomicMin(double* address, double val)
            {
            #if __CUDA_ARCH__ >= 130
                unsigned long long int* address_as_ull = (unsigned long long int*) address;
                unsigned long long int old = *address_as_ull, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
                } while (assumed != old);
                return __longlong_as_double(old);
            #else
                (void) address;
                (void) val;
                return 0.0;
            #endif
            }

            static __device__ __forceinline__ int atomicMax(int* address, int val)
            {
                return ::atomicMax(address, val);
            }
            static __device__ __forceinline__ float atomicMax(float* address, float val)
            {
            #if __CUDA_ARCH__ >= 120
                int* address_as_i = (int*) address;
                int old = *address_as_i, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_i, assumed,
                        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
                } while (assumed != old);
                return __int_as_float(old);
            #else
                (void) address;
                (void) val;
                return 0.0f;
            #endif
            }
            static __device__ __forceinline__ double atomicMax(double* address, double val)
            {
            #if __CUDA_ARCH__ >= 130
                unsigned long long int* address_as_ull = (unsigned long long int*) address;
                unsigned long long int old = *address_as_ull, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
                } while (assumed != old);
                return __longlong_as_double(old);
            #else
                (void) address;
                (void) val;
                return 0.0;
            #endif
            }
        };
    }; // struct Emulation
}}} // namespace cv { namespace cuda { namespace device

//! @endcond

#endif /* OPENCV_CUDA_EMULATION_HPP_ */