/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *SSE, int *Sum);

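/*
 * Each get_var_avx2 kernel fills in two partial statistics for one tile:
 * *SSE, the sum of squared differences, and *Sum, the signed sum of
 * differences. Given the i += 16 stepping in variance_avx2 below, each call
 * is assumed to cover 16 rows; the "32x32" in the second kernel's name
 * refers to the 32-column width it handles per call.
 */
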
void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);

unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);

static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse0;
  int sum0;
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      // each call covers a block_size-wide by 16-row tile
      var_fn(src_ptr + source_stride * i + j, source_stride,
             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

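/*
 * The wrappers below convert the accumulated (SSE, Sum) pair into a variance
 * using the identity
 *     variance = SSE - Sum^2 / N,   N = w * h,
 * where N is always a power of two here, so the division becomes a right
 * shift by log2(N). A minimal scalar sketch of the same computation (the
 * name variance_ref_c is illustrative, not part of this file):
 *
 *   static unsigned int variance_ref_c(const uint8_t *src, int src_stride,
 *                                      const uint8_t *ref, int ref_stride,
 *                                      int w, int h, unsigned int *sse) {
 *     int i, j, sum = 0;
 *     unsigned int sse_acc = 0;
 *     for (i = 0; i < h; ++i) {
 *       for (j = 0; j < w; ++j) {
 *         const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
 *         sum += d;
 *         sse_acc += d * d;
 *       }
 *     }
 *     *sse = sse_acc;
 *     return sse_acc - (unsigned int)(((int64_t)sum * sum) / (w * h));
 *   }
 */
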
unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
                &var, &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

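/*
 * Overflow note (a quick check, not from the original source): for a 16x16
 * block |avg| <= 256 * 255 = 65280, and 65280^2 = 4261478400 < 2^32, so the
 * unsigned 32-bit multiply above cannot overflow; the larger blocks below
 * widen to int64_t before squaring because their sums can exceed that bound.
 */
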
unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int sse0;
  int sum0;
  // MSE here is the raw SSE over the block: the mean term is not subtracted,
  // so sum0 is produced by the kernel but intentionally unused.
  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
                       &sum0);
  *sse = sse0;
  return sse0;
}

unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process 32 pixels per row in parallel; N = 1024, hence the shift by 10
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 10));
}

unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process 32 pixels per row in parallel; N = 512, hence the shift by 9
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 9));
}

unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process 32 pixels per row in parallel; N = 4096, hence the shift by 12
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 12));
}

unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process 32 pixels per row in parallel; N = 2048, hence the shift by 11
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 11));
}

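/*
 * The sub-pixel variants below interpolate src at the (x_offset, y_offset)
 * sub-pel phase (a bilinear filter in the matching C reference code) and
 * then accumulate the same (SSE, Sum) statistics over the filtered block.
 */
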
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  int se, se2;

  // first 32-column half
  se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                       y_offset, dst, dst_stride,
                                       64, &sse);
  // second 32-column half
  se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                        x_offset, y_offset,
                                        dst + 32, dst_stride,
                                        64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  // N = 64 * 64 = 4096, hence the shift by 12
  return sse - (((int64_t)se * se) >> 12);
}

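/*
 * Splitting the 64-wide block into two 32-column passes is valid because
 * SSE and Sum are additive over disjoint regions, while variance is not;
 * the mean correction is applied once, after both halves are accumulated.
 */
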
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // process 32 pixels per row in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  // N = 32 * 32 = 1024, hence the shift by 10
  return sse - (((int64_t)se * se) >> 10);
}

unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse, sse2;
  int se, se2;

  // first 32-column half
  se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           sec, 64, 64, &sse);
  // second 32-column half
  se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                            y_offset, dst + 32, dst_stride,
                                            sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

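/*
 * Note (inferred from the call sites above and below): the second predictor
 * sec is passed with sec_stride equal to the block width (64 here, 32 in the
 * 32x32 wrapper), i.e. a contiguous block-sized buffer used for compound
 * prediction averaging.
 */
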
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // process 32 pixels per row in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sseptr = sse;
  // N = 32 * 32 = 1024, hence the shift by 10
  return sse - (((int64_t)se * se) >> 10);
}