Home | History | Annotate | Download | only in src
      1 // This file is part of OpenCV project.
      2 // It is subject to the license terms in the LICENSE file found in the top-level directory
      3 // of this distribution and at http://opencv.org/license.html.
      4 
      5 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
      6 // Third party copyrights are property of their respective owners.
      7 
      8 #include "precomp.hpp"
      9 #ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
     10 #define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
     11 
     12 #include "opencl_kernels_photo.hpp"
     13 
     14 #ifdef HAVE_OPENCL
     15 
     16 namespace cv {
     17 
     18 enum
     19 {
     20     BLOCK_ROWS = 32,
     21     BLOCK_COLS = 32,
     22     CTA_SIZE_INTEL = 64,
     23     CTA_SIZE_DEFAULT = 256
     24 };
     25 
     26 static int divUp(int a, int b)
     27 {
     28     return (a + b - 1) / b;
     29 }
     30 
     31 template <typename FT, typename ST, typename WT>
     32 static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
     33                                       int searchWindowSize, int templateWindowSize,
     34                                       const FT *h, int hn, int cn, int normType,
     35                                       int & almostTemplateWindowSizeSqBinShift)
     36 {
     37     const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
     38         std::numeric_limits<ST>::max();
     39     int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
     40                                            std::numeric_limits<int>::max());
     41     int depth = DataType<FT>::depth;
     42     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
     43 
     44     if (depth == CV_64F && !doubleSupport)
     45         return false;
     46 
     47     // precalc weight for every possible l2 dist between blocks
     48     // additional optimization of precalced weights to replace division(averaging) by binary shift
     49     CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX)
     50     int templateWindowSizeSq = templateWindowSize * templateWindowSize;
     51     almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq);
     52     FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
     53 
     54     const FT WEIGHT_THRESHOLD = 1e-3f;
     55     int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
     56         std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
     57     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
     58     FT den[4];
     59     CV_Assert(hn > 0 && hn <= 4);
     60     for (int i=0; i<hn; i++)
     61         den[i] = 1.0f / (h[i] * h[i] * cn);
     62 
     63     almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
     64 
     65     char buf[40];
     66     ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
     67                   format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
     68                          " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
     69                          ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
     70                          ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
     71                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
     72                          normType == NORM_L1 ? " -D ABS" : ""));
     73     if (k.empty())
     74         return false;
     75 
     76     k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
     77            almostDist2ActualDistMultiplier, fixedPointMult,
     78            ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
     79 
     80     size_t globalsize[1] = { almostMaxDist };
     81     return k.run(1, globalsize, NULL, false);
     82 }
     83 
     84 static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
     85                                      int templateWindowSize, int searchWindowSize, int normType)
     86 {
     87     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     88     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
     89     Size size = _src.size();
     90 
     91     if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
     92                              (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
     93         return false;
     94 
     95     int templateWindowHalfWize = templateWindowSize / 2;
     96     int searchWindowHalfSize = searchWindowSize / 2;
     97     templateWindowSize  = templateWindowHalfWize * 2 + 1;
     98     searchWindowSize = searchWindowHalfSize * 2 + 1;
     99     int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
    100     int almostTemplateWindowSizeSqBinShift = -1;
    101 
    102     char buf[4][40];
    103     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
    104                          " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
    105                          " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
    106                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
    107                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
    108                          " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
    109                          templateWindowSize, searchWindowSize,
    110                          ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
    111                          ocl::typeToStr(CV_32SC(hn)),
    112                          depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
    113                          format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
    114                          depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
    115                          format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
    116                          depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
    117                          format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
    118                          depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
    119                          format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
    120                          BLOCK_COLS, BLOCK_ROWS,
    121                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
    122                          ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
    123                          (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
    124                          ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
    125                          normType == NORM_L1 ? " -D ABS" : "");
    126 
    127     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
    128     if (k.empty())
    129         return false;
    130 
    131     UMat almostDist2Weight;
    132     if ((depth == CV_8U &&
    133          !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
    134                                                        searchWindowSize, templateWindowSize,
    135                                                        h, hn, cn, normType,
    136                                                        almostTemplateWindowSizeSqBinShift)) ||
    137         (depth == CV_16U &&
    138          !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
    139                                                           searchWindowSize, templateWindowSize,
    140                                                           h, hn, cn, normType,
    141                                                           almostTemplateWindowSizeSqBinShift)))
    142         return false;
    143     CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
    144 
    145     UMat srcex;
    146     int borderSize = searchWindowHalfSize + templateWindowHalfWize;
    147     if (cn == 3) {
    148         srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
    149         UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
    150         int from_to[] = { 0,0, 1,1, 2,2 };
    151         mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
    152         copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
    153                        BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
    154     }
    155     else
    156         copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
    157 
    158     _dst.create(size, type);
    159     UMat dst;
    160     if (cn == 3)
    161         dst.create(size, CV_MAKE_TYPE(depth, 4));
    162     else
    163         dst = _dst.getUMat();
    164 
    165     int searchWindowSizeSq = searchWindowSize * searchWindowSize;
    166     Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
    167     Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
    168     UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
    169 
    170     srcex = srcex(Rect(Point(borderSize, borderSize), size));
    171     k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
    172            ocl::KernelArg::PtrReadOnly(almostDist2Weight),
    173            ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
    174 
    175     size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
    176     if (!k.run(2, globalsize, localsize, false)) return false;
    177 
    178     if (cn == 3) {
    179         int from_to[] = { 0,0, 1,1, 2,2 };
    180         mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
    181     }
    182 
    183     return true;
    184 }
    185 
    186 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
    187                                       float h, float hForColorComponents,
    188                                       int templateWindowSize, int searchWindowSize)
    189 {
    190     UMat src = _src.getUMat();
    191     _dst.create(src.size(), src.type());
    192     UMat dst = _dst.getUMat();
    193 
    194     UMat src_lab;
    195     cvtColor(src, src_lab, COLOR_LBGR2Lab);
    196 
    197     UMat l(src.size(), CV_8U);
    198     UMat ab(src.size(), CV_8UC2);
    199     std::vector<UMat> l_ab(2), l_ab_denoised(2);
    200     l_ab[0] = l;
    201     l_ab[1] = ab;
    202     l_ab_denoised[0].create(src.size(), CV_8U);
    203     l_ab_denoised[1].create(src.size(), CV_8UC2);
    204 
    205     int from_to[] = { 0,0, 1,1, 2,2 };
    206     mixChannels(std::vector<UMat>(1, src_lab), l_ab, from_to, 3);
    207 
    208     fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize);
    209     fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize);
    210 
    211     UMat dst_lab(src.size(), CV_8UC3);
    212     mixChannels(l_ab_denoised, std::vector<UMat>(1, dst_lab), from_to, 3);
    213 
    214     cvtColor(dst_lab, dst, COLOR_Lab2LBGR, src.channels());
    215     return true;
    216 }
    217 
    218 }
    219 
    220 #endif
    221 #endif
    222