Home | History | Annotate | Download | only in gpu-basics-similarity
      1 #include <iostream>                   // Console I/O
      2 #include <sstream>                    // String to number conversion
      3 
      4 #include <opencv2/core.hpp>      // Basic OpenCV structures
      5 #include <opencv2/core/utility.hpp>
      6 #include <opencv2/imgproc.hpp>// Image processing methods for the CPU
      7 #include <opencv2/imgcodecs.hpp>// Read images
      8 
      9 // CUDA structures and methods
     10 #include <opencv2/cudaarithm.hpp>
     11 #include <opencv2/cudafilters.hpp>
     12 
     13 using namespace std;
     14 using namespace cv;
     15 
     16 double getPSNR(const Mat& I1, const Mat& I2);      // CPU versions
     17 Scalar getMSSIM( const Mat& I1, const Mat& I2);
     18 
     19 double getPSNR_CUDA(const Mat& I1, const Mat& I2);  // Basic CUDA versions
     20 Scalar getMSSIM_CUDA( const Mat& I1, const Mat& I2);
     21 
     22 //! [psnr]
     23 struct BufferPSNR                                     // Optimized CUDA versions
     24 {   // Data allocations are very expensive on CUDA. Use a buffer to solve: allocate once reuse later.
     25     cuda::GpuMat gI1, gI2, gs, t1,t2;
     26 
     27     cuda::GpuMat buf;
     28 };
     29 //! [psnr]
     30 double getPSNR_CUDA_optimized(const Mat& I1, const Mat& I2, BufferPSNR& b);
     31 
     32 //! [ssim]
     33 struct BufferMSSIM                                     // Optimized CUDA versions
     34 {   // Data allocations are very expensive on CUDA. Use a buffer to solve: allocate once reuse later.
     35     cuda::GpuMat gI1, gI2, gs, t1,t2;
     36 
     37     cuda::GpuMat I1_2, I2_2, I1_I2;
     38     vector<cuda::GpuMat> vI1, vI2;
     39 
     40     cuda::GpuMat mu1, mu2;
     41     cuda::GpuMat mu1_2, mu2_2, mu1_mu2;
     42 
     43     cuda::GpuMat sigma1_2, sigma2_2, sigma12;
     44     cuda::GpuMat t3;
     45 
     46     cuda::GpuMat ssim_map;
     47 
     48     cuda::GpuMat buf;
     49 };
     50 //! [ssim]
     51 Scalar getMSSIM_CUDA_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b);
     52 
     53 static void help()
     54 {
     55     cout
     56         << "\n--------------------------------------------------------------------------" << endl
     57         << "This program shows how to port your CPU code to CUDA or write that from scratch." << endl
     58         << "You can see the performance improvement for the similarity check methods (PSNR and SSIM)."  << endl
     59         << "Usage:"                                                               << endl
     60         << "./gpu-basics-similarity referenceImage comparedImage numberOfTimesToRunTest(like 10)." << endl
     61         << "--------------------------------------------------------------------------"   << endl
     62         << endl;
     63 }
     64 
     65 int main(int, char *argv[])
     66 {
     67     help();
     68     Mat I1 = imread(argv[1]);           // Read the two images
     69     Mat I2 = imread(argv[2]);
     70 
     71     if (!I1.data || !I2.data)           // Check for success
     72     {
     73         cout << "Couldn't read the image";
     74         return 0;
     75     }
     76 
     77     BufferPSNR bufferPSNR;
     78     BufferMSSIM bufferMSSIM;
     79 
     80     int TIMES = 10;
     81     stringstream sstr(argv[3]);
     82     sstr >> TIMES;
     83     double time, result = 0;
     84 
     85     //------------------------------- PSNR CPU ----------------------------------------------------
     86     time = (double)getTickCount();
     87 
     88     for (int i = 0; i < TIMES; ++i)
     89         result = getPSNR(I1,I2);
     90 
     91     time = 1000*((double)getTickCount() - time)/getTickFrequency();
     92     time /= TIMES;
     93 
     94     cout << "Time of PSNR CPU (averaged for " << TIMES << " runs): " << time << " milliseconds."
     95         << " With result of: " << result << endl;
     96 
     97     //------------------------------- PSNR CUDA ----------------------------------------------------
     98     time = (double)getTickCount();
     99 
    100     for (int i = 0; i < TIMES; ++i)
    101         result = getPSNR_CUDA(I1,I2);
    102 
    103     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    104     time /= TIMES;
    105 
    106     cout << "Time of PSNR CUDA (averaged for " << TIMES << " runs): " << time << " milliseconds."
    107         << " With result of: " <<  result << endl;
    108 
    109     //------------------------------- PSNR CUDA Optimized--------------------------------------------
    110     time = (double)getTickCount();                                  // Initial call
    111     result = getPSNR_CUDA_optimized(I1, I2, bufferPSNR);
    112     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    113     cout << "Initial call CUDA optimized:              " << time  <<" milliseconds."
    114         << " With result of: " << result << endl;
    115 
    116     time = (double)getTickCount();
    117     for (int i = 0; i < TIMES; ++i)
    118         result = getPSNR_CUDA_optimized(I1, I2, bufferPSNR);
    119 
    120     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    121     time /= TIMES;
    122 
    123     cout << "Time of PSNR CUDA OPTIMIZED ( / " << TIMES << " runs): " << time
    124         << " milliseconds." << " With result of: " <<  result << endl << endl;
    125 
    126 
    127     //------------------------------- SSIM CPU -----------------------------------------------------
    128     Scalar x;
    129     time = (double)getTickCount();
    130 
    131     for (int i = 0; i < TIMES; ++i)
    132         x = getMSSIM(I1,I2);
    133 
    134     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    135     time /= TIMES;
    136 
    137     cout << "Time of MSSIM CPU (averaged for " << TIMES << " runs): " << time << " milliseconds."
    138         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl;
    139 
    140     //------------------------------- SSIM CUDA -----------------------------------------------------
    141     time = (double)getTickCount();
    142 
    143     for (int i = 0; i < TIMES; ++i)
    144         x = getMSSIM_CUDA(I1,I2);
    145 
    146     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    147     time /= TIMES;
    148 
    149     cout << "Time of MSSIM CUDA (averaged for " << TIMES << " runs): " << time << " milliseconds."
    150         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl;
    151 
    152     //------------------------------- SSIM CUDA Optimized--------------------------------------------
    153     time = (double)getTickCount();
    154     x = getMSSIM_CUDA_optimized(I1,I2, bufferMSSIM);
    155     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    156     cout << "Time of MSSIM CUDA Initial Call            " << time << " milliseconds."
    157         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl;
    158 
    159     time = (double)getTickCount();
    160 
    161     for (int i = 0; i < TIMES; ++i)
    162         x = getMSSIM_CUDA_optimized(I1,I2, bufferMSSIM);
    163 
    164     time = 1000*((double)getTickCount() - time)/getTickFrequency();
    165     time /= TIMES;
    166 
    167     cout << "Time of MSSIM CUDA OPTIMIZED ( / " << TIMES << " runs): " << time << " milliseconds."
    168         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl << endl;
    169     return 0;
    170 }
    171 
    172 //! [getpsnr]
    173 double getPSNR(const Mat& I1, const Mat& I2)
    174 {
    175     Mat s1;
    176     absdiff(I1, I2, s1);       // |I1 - I2|
    177     s1.convertTo(s1, CV_32F);  // cannot make a square on 8 bits
    178     s1 = s1.mul(s1);           // |I1 - I2|^2
    179 
    180     Scalar s = sum(s1);         // sum elements per channel
    181 
    182     double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels
    183 
    184     if( sse <= 1e-10) // for small values return zero
    185         return 0;
    186     else
    187     {
    188         double  mse =sse /(double)(I1.channels() * I1.total());
    189         double psnr = 10.0*log10((255*255)/mse);
    190         return psnr;
    191     }
    192 }
    193 //! [getpsnr]
    194 
    195 //! [getpsnropt]
    196 double getPSNR_CUDA_optimized(const Mat& I1, const Mat& I2, BufferPSNR& b)
    197 {
    198     b.gI1.upload(I1);
    199     b.gI2.upload(I2);
    200 
    201     b.gI1.convertTo(b.t1, CV_32F);
    202     b.gI2.convertTo(b.t2, CV_32F);
    203 
    204     cuda::absdiff(b.t1.reshape(1), b.t2.reshape(1), b.gs);
    205     cuda::multiply(b.gs, b.gs, b.gs);
    206 
    207     double sse = cuda::sum(b.gs, b.buf)[0];
    208 
    209     if( sse <= 1e-10) // for small values return zero
    210         return 0;
    211     else
    212     {
    213         double mse = sse /(double)(I1.channels() * I1.total());
    214         double psnr = 10.0*log10((255*255)/mse);
    215         return psnr;
    216     }
    217 }
    218 //! [getpsnropt]
    219 
    220 //! [getpsnrcuda]
    221 double getPSNR_CUDA(const Mat& I1, const Mat& I2)
    222 {
    223     cuda::GpuMat gI1, gI2, gs, t1,t2;
    224 
    225     gI1.upload(I1);
    226     gI2.upload(I2);
    227 
    228     gI1.convertTo(t1, CV_32F);
    229     gI2.convertTo(t2, CV_32F);
    230 
    231     cuda::absdiff(t1.reshape(1), t2.reshape(1), gs);
    232     cuda::multiply(gs, gs, gs);
    233 
    234     Scalar s = cuda::sum(gs);
    235     double sse = s.val[0] + s.val[1] + s.val[2];
    236 
    237     if( sse <= 1e-10) // for small values return zero
    238         return 0;
    239     else
    240     {
    241         double  mse =sse /(double)(gI1.channels() * I1.total());
    242         double psnr = 10.0*log10((255*255)/mse);
    243         return psnr;
    244     }
    245 }
    246 //! [getpsnrcuda]
    247 
    248 //! [getssim]
    249 Scalar getMSSIM( const Mat& i1, const Mat& i2)
    250 {
    251     const double C1 = 6.5025, C2 = 58.5225;
    252     /***************************** INITS **********************************/
    253     int d     = CV_32F;
    254 
    255     Mat I1, I2;
    256     i1.convertTo(I1, d);           // cannot calculate on one byte large values
    257     i2.convertTo(I2, d);
    258 
    259     Mat I2_2   = I2.mul(I2);        // I2^2
    260     Mat I1_2   = I1.mul(I1);        // I1^2
    261     Mat I1_I2  = I1.mul(I2);        // I1 * I2
    262 
    263     /*************************** END INITS **********************************/
    264 
    265     Mat mu1, mu2;   // PRELIMINARY COMPUTING
    266     GaussianBlur(I1, mu1, Size(11, 11), 1.5);
    267     GaussianBlur(I2, mu2, Size(11, 11), 1.5);
    268 
    269     Mat mu1_2   =   mu1.mul(mu1);
    270     Mat mu2_2   =   mu2.mul(mu2);
    271     Mat mu1_mu2 =   mu1.mul(mu2);
    272 
    273     Mat sigma1_2, sigma2_2, sigma12;
    274 
    275     GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
    276     sigma1_2 -= mu1_2;
    277 
    278     GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
    279     sigma2_2 -= mu2_2;
    280 
    281     GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
    282     sigma12 -= mu1_mu2;
    283 
    284     ///////////////////////////////// FORMULA ////////////////////////////////
    285     Mat t1, t2, t3;
    286 
    287     t1 = 2 * mu1_mu2 + C1;
    288     t2 = 2 * sigma12 + C2;
    289     t3 = t1.mul(t2);              // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
    290 
    291     t1 = mu1_2 + mu2_2 + C1;
    292     t2 = sigma1_2 + sigma2_2 + C2;
    293     t1 = t1.mul(t2);               // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
    294 
    295     Mat ssim_map;
    296     divide(t3, t1, ssim_map);      // ssim_map =  t3./t1;
    297 
    298     Scalar mssim = mean( ssim_map ); // mssim = average of ssim map
    299     return mssim;
    300 }
    301 //! [getssim]
    302 
    303 //! [getssimcuda]
    304 Scalar getMSSIM_CUDA( const Mat& i1, const Mat& i2)
    305 {
    306     const float C1 = 6.5025f, C2 = 58.5225f;
    307     /***************************** INITS **********************************/
    308     cuda::GpuMat gI1, gI2, gs1, tmp1,tmp2;
    309 
    310     gI1.upload(i1);
    311     gI2.upload(i2);
    312 
    313     gI1.convertTo(tmp1, CV_MAKE_TYPE(CV_32F, gI1.channels()));
    314     gI2.convertTo(tmp2, CV_MAKE_TYPE(CV_32F, gI2.channels()));
    315 
    316     vector<cuda::GpuMat> vI1, vI2;
    317     cuda::split(tmp1, vI1);
    318     cuda::split(tmp2, vI2);
    319     Scalar mssim;
    320 
    321     Ptr<cuda::Filter> gauss = cuda::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
    322 
    323     for( int i = 0; i < gI1.channels(); ++i )
    324     {
    325         cuda::GpuMat I2_2, I1_2, I1_I2;
    326 
    327         cuda::multiply(vI2[i], vI2[i], I2_2);        // I2^2
    328         cuda::multiply(vI1[i], vI1[i], I1_2);        // I1^2
    329         cuda::multiply(vI1[i], vI2[i], I1_I2);       // I1 * I2
    330 
    331         /*************************** END INITS **********************************/
    332         cuda::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
    333         gauss->apply(vI1[i], mu1);
    334         gauss->apply(vI2[i], mu2);
    335 
    336         cuda::GpuMat mu1_2, mu2_2, mu1_mu2;
    337         cuda::multiply(mu1, mu1, mu1_2);
    338         cuda::multiply(mu2, mu2, mu2_2);
    339         cuda::multiply(mu1, mu2, mu1_mu2);
    340 
    341         cuda::GpuMat sigma1_2, sigma2_2, sigma12;
    342 
    343         gauss->apply(I1_2, sigma1_2);
    344         cuda::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
    345 
    346         gauss->apply(I2_2, sigma2_2);
    347         cuda::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
    348 
    349         gauss->apply(I1_I2, sigma12);
    350         cuda::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
    351 
    352         ///////////////////////////////// FORMULA ////////////////////////////////
    353         cuda::GpuMat t1, t2, t3;
    354 
    355         mu1_mu2.convertTo(t1, -1, 2, C1); // t1 = 2 * mu1_mu2 + C1;
    356         sigma12.convertTo(t2, -1, 2, C2); // t2 = 2 * sigma12 + C2;
    357         cuda::multiply(t1, t2, t3);        // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
    358 
    359         cuda::addWeighted(mu1_2, 1.0, mu2_2, 1.0, C1, t1);       // t1 = mu1_2 + mu2_2 + C1;
    360         cuda::addWeighted(sigma1_2, 1.0, sigma2_2, 1.0, C2, t2); // t2 = sigma1_2 + sigma2_2 + C2;
    361         cuda::multiply(t1, t2, t1);                              // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
    362 
    363         cuda::GpuMat ssim_map;
    364         cuda::divide(t3, t1, ssim_map);      // ssim_map =  t3./t1;
    365 
    366         Scalar s = cuda::sum(ssim_map);
    367         mssim.val[i] = s.val[0] / (ssim_map.rows * ssim_map.cols);
    368 
    369     }
    370     return mssim;
    371 }
    372 //! [getssimcuda]
    373 
    374 //! [getssimopt]
    375 Scalar getMSSIM_CUDA_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
    376 {
    377     const float C1 = 6.5025f, C2 = 58.5225f;
    378     /***************************** INITS **********************************/
    379 
    380     b.gI1.upload(i1);
    381     b.gI2.upload(i2);
    382 
    383     cuda::Stream stream;
    384 
    385     b.gI1.convertTo(b.t1, CV_32F, stream);
    386     b.gI2.convertTo(b.t2, CV_32F, stream);
    387 
    388     cuda::split(b.t1, b.vI1, stream);
    389     cuda::split(b.t2, b.vI2, stream);
    390     Scalar mssim;
    391 
    392     Ptr<cuda::Filter> gauss = cuda::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
    393 
    394     for( int i = 0; i < b.gI1.channels(); ++i )
    395     {
    396         cuda::multiply(b.vI2[i], b.vI2[i], b.I2_2, 1, -1, stream);        // I2^2
    397         cuda::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
    398         cuda::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
    399 
    400         gauss->apply(b.vI1[i], b.mu1, stream);
    401         gauss->apply(b.vI2[i], b.mu2, stream);
    402 
    403         cuda::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
    404         cuda::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
    405         cuda::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
    406 
    407         gauss->apply(b.I1_2, b.sigma1_2, stream);
    408         cuda::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, cuda::GpuMat(), -1, stream);
    409         //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
    410 
    411         gauss->apply(b.I2_2, b.sigma2_2, stream);
    412         cuda::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, cuda::GpuMat(), -1, stream);
    413         //b.sigma2_2 -= b.mu2_2;
    414 
    415         gauss->apply(b.I1_I2, b.sigma12, stream);
    416         cuda::subtract(b.sigma12, b.mu1_mu2, b.sigma12, cuda::GpuMat(), -1, stream);
    417         //b.sigma12 -= b.mu1_mu2;
    418 
    419         //here too it would be an extra data transfer due to call of operator*(Scalar, Mat)
    420         cuda::multiply(b.mu1_mu2, 2, b.t1, 1, -1, stream); //b.t1 = 2 * b.mu1_mu2 + C1;
    421         cuda::add(b.t1, C1, b.t1, cuda::GpuMat(), -1, stream);
    422         cuda::multiply(b.sigma12, 2, b.t2, 1, -1, stream); //b.t2 = 2 * b.sigma12 + C2;
    423         cuda::add(b.t2, C2, b.t2, cuda::GpuMat(), -12, stream);
    424 
    425         cuda::multiply(b.t1, b.t2, b.t3, 1, -1, stream);     // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
    426 
    427         cuda::add(b.mu1_2, b.mu2_2, b.t1, cuda::GpuMat(), -1, stream);
    428         cuda::add(b.t1, C1, b.t1, cuda::GpuMat(), -1, stream);
    429 
    430         cuda::add(b.sigma1_2, b.sigma2_2, b.t2, cuda::GpuMat(), -1, stream);
    431         cuda::add(b.t2, C2, b.t2, cuda::GpuMat(), -1, stream);
    432 
    433 
    434         cuda::multiply(b.t1, b.t2, b.t1, 1, -1, stream);     // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
    435         cuda::divide(b.t3, b.t1, b.ssim_map, 1, -1, stream);      // ssim_map =  t3./t1;
    436 
    437         stream.waitForCompletion();
    438 
    439         Scalar s = cuda::sum(b.ssim_map, b.buf);
    440         mssim.val[i] = s.val[0] / (b.ssim_map.rows * b.ssim_map.cols);
    441 
    442     }
    443     return mssim;
    444 }
    445 //! [getssimopt]
    446