/* vpx_dsp/x86/sad_avx2.c -- AVX2 sum-of-absolute-differences kernels (x86). */
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 #include <immintrin.h>
     11 #include "./vpx_dsp_rtcd.h"
     12 #include "vpx_ports/mem.h"
     13 
/*
 * FSAD64_H(h) expands to vpx_sad64x##h##_avx2(): the sum of absolute
 * differences between a 64x<h> source block and a 64x<h> reference block.
 *
 * Per row: two unaligned 32-byte loads each from src and ref.
 * _mm256_sad_epu8 produces four partial SADs (one per 8-byte group),
 * widened into the four 64-bit lanes of a __m256i.  Accumulating those
 * with _mm256_add_epi32 is safe: each lane's running total is bounded by
 * 64*64*255, far below 2^32, so the upper halves of the 64-bit lanes
 * stay zero.
 *
 * Tail (horizontal reduction of sum_sad to a scalar):
 *   - _mm256_srli_si256 shifts each 128-bit half right by 8 bytes so the
 *     two qword partials within each half line up and can be added,
 *   - _mm256_extracti128_si256 / _mm256_castsi256_si128 + _mm_add_epi32
 *     combine the upper and lower 128-bit halves,
 *   - _mm_cvtsi128_si32 reads out the final 32-bit total.
 */
#define FSAD64_H(h)                                                           \
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    for (i = 0; i < h; i++) {                                                 \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref_stride;                                                  \
      src_ptr += src_stride;                                                  \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }
     41 
/*
 * FSAD32_H(h) expands to vpx_sad32x##h##_avx2(): the sum of absolute
 * differences between a 32x<h> source block and a 32x<h> reference block.
 *
 * Because each row is only 32 bytes (one __m256i), the loop processes
 * TWO rows per iteration (h >> 1 iterations, advancing by doubled
 * strides) so that both _mm256_sad_epu8 issue slots are used per trip,
 * mirroring the two-load structure of FSAD64_H.
 *
 * The accumulation and the final horizontal reduction of sum_sad to a
 * 32-bit scalar are identical to FSAD64_H: per-lane totals are bounded
 * by 32*64*255, so _mm256_add_epi32 on the sad_epu8 outputs cannot
 * overflow into the upper halves of the 64-bit lanes.
 */
#define FSAD32_H(h)                                                           \
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    int ref2_stride = ref_stride << 1;                                        \
    int src2_stride = src_stride << 1;                                        \
    int max = h >> 1;                                                         \
    for (i = 0; i < max; i++) {                                               \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg,                                                           \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref2_stride;                                                 \
      src_ptr += src2_stride;                                                 \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }
     73 
/* Instantiate the plain SAD functions.  The generator macros above
 * token-paste the height into the function name, yielding:
 *   vpx_sad64x64_avx2, vpx_sad64x32_avx2,
 *   vpx_sad32x64_avx2, vpx_sad32x32_avx2, vpx_sad32x16_avx2.
 */
#define FSAD64  \
  FSAD64_H(64); \
  FSAD64_H(32);

#define FSAD32  \
  FSAD32_H(64); \
  FSAD32_H(32); \
  FSAD32_H(16);

FSAD64;
FSAD32;

/* The generator macros are single-use; drop them so the FSADAVG
 * definitions below start from a clean namespace. */
#undef FSAD64
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H
     90 
/*
 * FSADAVG64_H(h) expands to vpx_sad64x##h##_avg_avx2(): like FSAD64_H,
 * but the reference rows are first averaged (rounding, via
 * _mm256_avg_epu8) with a 64-byte-wide second predictor before the SAD
 * is taken against the source -- the compound-prediction variant.
 *
 * second_pred is a contiguous 64xh block, so it advances by a fixed 64
 * bytes per row rather than by a caller stride.
 *
 * Accumulation and the final horizontal reduction of sum_sad to a
 * 32-bit scalar are identical to the plain FSAD kernels: srli/extract/
 * add folds the four 64-bit lane partials, cvtsi128_si32 reads the
 * total.
 */
#define FSADAVG64_H(h)                                                        \
  unsigned int vpx_sad64x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    for (i = 0; i < h; i++) {                                                 \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
      ref1_reg = _mm256_avg_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
      ref2_reg = _mm256_avg_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref_stride;                                                  \
      src_ptr += src_stride;                                                  \
      second_pred += 64;                                                      \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }
    124 
/*
 * FSADAVG32_H(h) expands to vpx_sad32x##h##_avg_avx2(): like FSAD32_H
 * (two 32-byte rows per loop iteration, doubled strides, h >> 1 trips),
 * but each reference row is first averaged (rounding, via
 * _mm256_avg_epu8) with the corresponding row of a second predictor --
 * the compound-prediction variant.
 *
 * second_pred is a contiguous 32xh block; since each iteration consumes
 * two 32-byte rows of it (offsets 0 and 32), it advances by 64 bytes
 * per trip.
 *
 * Accumulation and the final horizontal reduction of sum_sad to a
 * 32-bit scalar are identical to the other kernels in this file.
 */
#define FSADAVG32_H(h)                                                        \
  unsigned int vpx_sad32x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    int ref2_stride = ref_stride << 1;                                        \
    int src2_stride = src_stride << 1;                                        \
    int max = h >> 1;                                                         \
    for (i = 0; i < max; i++) {                                               \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      ref1_reg = _mm256_avg_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
      ref2_reg = _mm256_avg_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg,                                                           \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref2_stride;                                                 \
      src_ptr += src2_stride;                                                 \
      second_pred += 64;                                                      \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }
    162 
/* Instantiate the compound-prediction (averaging) SAD functions:
 *   vpx_sad64x64_avg_avx2, vpx_sad64x32_avg_avx2,
 *   vpx_sad32x64_avg_avx2, vpx_sad32x32_avg_avx2, vpx_sad32x16_avg_avx2.
 */
#define FSADAVG64  \
  FSADAVG64_H(64); \
  FSADAVG64_H(32);

#define FSADAVG32  \
  FSADAVG32_H(64); \
  FSADAVG32_H(32); \
  FSADAVG32_H(16);

FSADAVG64;
FSADAVG32;

/* Generator macros are single-use; remove them from the namespace. */
#undef FSADAVG64
#undef FSADAVG32
#undef FSADAVG64_H
#undef FSADAVG32_H
    179