Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 #include <immintrin.h>  // AVX2
     11 #include "./vpx_dsp_rtcd.h"
     12 #include "vpx/vpx_integer.h"
     13 
     14 void vpx_sad32x32x4d_avx2(const uint8_t *src,
     15                           int src_stride,
     16                           const uint8_t *const ref[4],
     17                           int ref_stride,
     18                           uint32_t res[4]) {
     19   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
     20   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
     21   __m256i sum_mlow, sum_mhigh;
     22   int i;
     23   const uint8_t *ref0, *ref1, *ref2, *ref3;
     24 
     25   ref0 = ref[0];
     26   ref1 = ref[1];
     27   ref2 = ref[2];
     28   ref3 = ref[3];
     29   sum_ref0 = _mm256_set1_epi16(0);
     30   sum_ref1 = _mm256_set1_epi16(0);
     31   sum_ref2 = _mm256_set1_epi16(0);
     32   sum_ref3 = _mm256_set1_epi16(0);
     33   for (i = 0; i < 32 ; i++) {
     34     // load src and all refs
     35     src_reg = _mm256_loadu_si256((const __m256i *)src);
     36     ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
     37     ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
     38     ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
     39     ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
     40     // sum of the absolute differences between every ref-i to src
     41     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
     42     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
     43     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
     44     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
     45     // sum every ref-i
     46     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
     47     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
     48     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
     49     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
     50 
     51     src+= src_stride;
     52     ref0+= ref_stride;
     53     ref1+= ref_stride;
     54     ref2+= ref_stride;
     55     ref3+= ref_stride;
     56   }
     57   {
     58     __m128i sum;
     59     // in sum_ref-i the result is saved in the first 4 bytes
     60     // the other 4 bytes are zeroed.
     61     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
     62     sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
     63     sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
     64 
     65     // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
     66     sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
     67     sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
     68 
     69     // merge every 64 bit from each sum_ref-i
     70     sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
     71     sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
     72 
     73     // add the low 64 bit to the high 64 bit
     74     sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
     75 
     76     // add the low 128 bit to the high 128 bit
     77     sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
     78                         _mm256_extractf128_si256(sum_mlow, 1));
     79 
     80     _mm_storeu_si128((__m128i *)(res), sum);
     81   }
     82 }
     83 
     84 void vpx_sad64x64x4d_avx2(const uint8_t *src,
     85                           int src_stride,
     86                           const uint8_t *const ref[4],
     87                           int ref_stride,
     88                           uint32_t res[4]) {
     89   __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
     90   __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
     91   __m256i ref3_reg, ref3next_reg;
     92   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
     93   __m256i sum_mlow, sum_mhigh;
     94   int i;
     95   const uint8_t *ref0, *ref1, *ref2, *ref3;
     96 
     97   ref0 = ref[0];
     98   ref1 = ref[1];
     99   ref2 = ref[2];
    100   ref3 = ref[3];
    101   sum_ref0 = _mm256_set1_epi16(0);
    102   sum_ref1 = _mm256_set1_epi16(0);
    103   sum_ref2 = _mm256_set1_epi16(0);
    104   sum_ref3 = _mm256_set1_epi16(0);
    105   for (i = 0; i < 64 ; i++) {
    106     // load 64 bytes from src and all refs
    107     src_reg = _mm256_loadu_si256((const __m256i *)src);
    108     srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
    109     ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
    110     ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
    111     ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
    112     ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
    113     ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
    114     ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
    115     ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
    116     ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
    117     // sum of the absolute differences between every ref-i to src
    118     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    119     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    120     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    121     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    122     ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
    123     ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
    124     ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
    125     ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
    126 
    127     // sum every ref-i
    128     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    129     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    130     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    131     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
    132     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
    133     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
    134     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
    135     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
    136     src+= src_stride;
    137     ref0+= ref_stride;
    138     ref1+= ref_stride;
    139     ref2+= ref_stride;
    140     ref3+= ref_stride;
    141   }
    142   {
    143     __m128i sum;
    144 
    145     // in sum_ref-i the result is saved in the first 4 bytes
    146     // the other 4 bytes are zeroed.
    147     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
    148     sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    149     sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
    150 
    151     // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
    152     sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    153     sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
    154 
    155     // merge every 64 bit from each sum_ref-i
    156     sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    157     sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
    158 
    159     // add the low 64 bit to the high 64 bit
    160     sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
    161 
    162     // add the low 128 bit to the high 128 bit
    163     sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
    164                         _mm256_extractf128_si256(sum_mlow, 1));
    165 
    166     _mm_storeu_si128((__m128i *)(res), sum);
    167   }
    168 }
    169