/* vpx_dsp/x86/variance_avx2.c */
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 #include "./vpx_dsp_rtcd.h"
     11 
/* Signature shared by the AVX2 block-variance kernels (implemented in
 * assembly elsewhere): accumulates, for one block of pixels, the sum of
 * squared differences into *sse and the signed sum of differences into
 * *sum. */
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

/* NOTE(review): despite the 32x32 name, the callers below advance 16 rows
 * per call (see the row loop in variance_avx2 and the >> 12 normalization
 * in vpx_variance64x64_avx2), which implies this kernel covers a 32-wide by
 * 16-tall region per invocation — confirm against the assembly. */
void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);
     19 
     20 static void variance_avx2(const uint8_t *src, int src_stride,
     21                           const uint8_t *ref, int  ref_stride,
     22                           int w, int h, unsigned int *sse, int *sum,
     23                           get_var_avx2 var_fn, int block_size) {
     24   int i, j;
     25 
     26   *sse = 0;
     27   *sum = 0;
     28 
     29   for (i = 0; i < h; i += 16) {
     30     for (j = 0; j < w; j += block_size) {
     31       unsigned int sse0;
     32       int sum0;
     33       var_fn(&src[src_stride * i + j], src_stride,
     34              &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
     35       *sse += sse0;
     36       *sum += sum0;
     37     }
     38   }
     39 }
     40 
     41 
     42 unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
     43                                     const uint8_t *ref, int ref_stride,
     44                                     unsigned int *sse) {
     45   int sum;
     46   variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
     47                 sse, &sum, vpx_get16x16var_avx2, 16);
     48   return *sse - (((unsigned int)sum * sum) >> 8);
     49 }
     50 
     51 unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
     52                                const uint8_t *ref, int ref_stride,
     53                                unsigned int *sse) {
     54   int sum;
     55   vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
     56   return *sse;
     57 }
     58 
     59 unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
     60                                     const uint8_t *ref, int ref_stride,
     61                                     unsigned int *sse) {
     62   int sum;
     63   variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
     64                 sse, &sum, vpx_get32x32var_avx2, 32);
     65   return *sse - (((int64_t)sum * sum) >> 9);
     66 }
     67 
     68 unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
     69                                     const uint8_t *ref, int ref_stride,
     70                                     unsigned int *sse) {
     71   int sum;
     72   variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
     73                 sse, &sum, vpx_get32x32var_avx2, 32);
     74   return *sse - (((int64_t)sum * sum) >> 10);
     75 }
     76 
     77 unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
     78                                     const uint8_t *ref, int ref_stride,
     79                                     unsigned int *sse) {
     80   int sum;
     81   variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
     82                 sse, &sum, vpx_get32x32var_avx2, 32);
     83   return *sse - (((int64_t)sum * sum) >> 12);
     84 }
     85 
     86 unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
     87                                     const uint8_t *ref, int ref_stride,
     88                                     unsigned int *sse) {
     89   int sum;
     90   variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
     91                 sse, &sum, vpx_get32x32var_avx2, 32);
     92   return *sse - (((int64_t)sum * sum) >> 11);
     93 }
     94 
/* AVX2 sub-pixel variance kernel over a 32-wide column of `height` rows,
 * implemented in assembly elsewhere.  Writes the SSE to *sse; the callers
 * below treat the return value as the signed sum of differences (it is
 * stored into `int` and squared for the variance correction). */
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height,
                                             unsigned int *sse);

/* Compound-prediction variant of the kernel above: also takes a second
 * predictor `sec` (with its own stride).  Writes SSE to *sseptr and, as
 * used by the callers below, returns the sum of differences. */
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);
    111 
    112 unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
    113                                               int src_stride,
    114                                               int x_offset,
    115                                               int y_offset,
    116                                               const uint8_t *dst,
    117                                               int dst_stride,
    118                                               unsigned int *sse) {
    119   unsigned int sse1;
    120   const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
    121                                                   y_offset, dst, dst_stride,
    122                                                   64, &sse1);
    123   unsigned int sse2;
    124   const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
    125                                                   x_offset, y_offset,
    126                                                   dst + 32, dst_stride,
    127                                                   64, &sse2);
    128   const int se = se1 + se2;
    129   *sse = sse1 + sse2;
    130   return *sse - (((int64_t)se * se) >> 12);
    131 }
    132 
    133 unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
    134                                               int src_stride,
    135                                               int x_offset,
    136                                               int y_offset,
    137                                               const uint8_t *dst,
    138                                               int dst_stride,
    139                                               unsigned int *sse) {
    140   const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
    141                                                  y_offset, dst, dst_stride,
    142                                                  32, sse);
    143   return *sse - (((int64_t)se * se) >> 10);
    144 }
    145 
    146 unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
    147                                                   int src_stride,
    148                                                   int x_offset,
    149                                                   int y_offset,
    150                                                   const uint8_t *dst,
    151                                                   int dst_stride,
    152                                                   unsigned int *sse,
    153                                                   const uint8_t *sec) {
    154   unsigned int sse1;
    155   const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
    156                                                       y_offset, dst, dst_stride,
    157                                                       sec, 64, 64, &sse1);
    158   unsigned int sse2;
    159   const int se2 =
    160     vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
    161                                         y_offset, dst + 32, dst_stride,
    162                                         sec + 32, 64, 64, &sse2);
    163   const int se = se1 + se2;
    164 
    165   *sse = sse1 + sse2;
    166 
    167   return *sse - (((int64_t)se * se) >> 12);
    168 }
    169 
    170 unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
    171                                                   int src_stride,
    172                                                   int x_offset,
    173                                                   int y_offset,
    174                                                   const uint8_t *dst,
    175                                                   int dst_stride,
    176                                                   unsigned int *sse,
    177                                                   const uint8_t *sec) {
    178   // Process 32 elements in parallel.
    179   const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
    180                                                      y_offset, dst, dst_stride,
    181                                                      sec, 32, 32, sse);
    182   return *sse - (((int64_t)se * se) >> 10);
    183 }
    184