/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>  // AVX2
#include "vpx/vpx_integer.h"

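// Each function in this file computes four SADs at once: the sum of absolute
// differences between one source block and each of the four reference blocks
// in ref[], with the four results written to res[0..3].  Source rows are read
// with aligned 32-byte loads, reference rows with unaligned loads.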
void vp9_sad32x32x4d_avx2(uint8_t *src,
                          int src_stride,
                          uint8_t *ref[4],
                          int ref_stride,
                          unsigned int res[4]) {
  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_setzero_si256();
  sum_ref1 = _mm256_setzero_si256();
  sum_ref2 = _mm256_setzero_si256();
  sum_ref3 = _mm256_setzero_si256();
  for (i = 0; i < 32; i++) {
    // load one 32-byte row from src and from each of the four refs
    src_reg = _mm256_load_si256((__m256i *)(src));
    ref0_reg = _mm256_loadu_si256((__m256i *)(ref0));
    ref1_reg = _mm256_loadu_si256((__m256i *)(ref1));
    ref2_reg = _mm256_loadu_si256((__m256i *)(ref2));
    ref3_reg = _mm256_loadu_si256((__m256i *)(ref3));
    // sum of absolute differences between each ref row and the src row
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    // accumulate the per-reference SADs
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;
    // within each 64-bit element of sum_ref-i the SAD sits in the low
    // 4 bytes and the high 4 bytes are zero.
    // shift sum_ref1 and sum_ref3 left by 4 bytes so they land on those
    // zero bytes
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
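    // viewed as 32-bit elements, sum_ref0 is now {s0, 0, s0, 0, ...} and the
    // shifted sum_ref1 is {0, s1, 0, s1, ...} (s-i being partial SADs of
    // ref-i), so OR-ing the two interleaves them without losing any bits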

    // merge sum_ref0 with sum_ref1 and sum_ref2 with sum_ref3
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // interleave the 64-bit elements of sum_ref0 and sum_ref2 so that each
    // 128-bit lane holds one partial SAD per reference
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the two partial SADs of each reference within each lane
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128 bits to the high 128 bits to get the final four SADs
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extractf128_si256(sum_mlow, 1));

    _mm_storeu_si128((__m128i *)(res), sum);
  }
}

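/* For reference, a minimal scalar sketch of what the kernel above computes:
 * res[j] becomes the 32x32 sum of absolute differences between src and
 * ref[j].  Evaluating four references against the same source block in one
 * call is what lets the AVX2 version reuse each loaded src row four times.
 * The helper name sad32x32x4d_ref is hypothetical and the block is compiled
 * out; it only documents the intended semantics.
 */
#if 0
static void sad32x32x4d_ref(const uint8_t *src, int src_stride,
                            uint8_t *const ref[4], int ref_stride,
                            unsigned int res[4]) {
  int i, j, k;
  for (j = 0; j < 4; j++) {
    const uint8_t *s = src;
    const uint8_t *r = ref[j];
    unsigned int sad = 0;
    for (i = 0; i < 32; i++) {          // 32 rows
      for (k = 0; k < 32; k++)          // 32 pixels per row
        sad += (s[k] > r[k]) ? (unsigned int)(s[k] - r[k])
                             : (unsigned int)(r[k] - s[k]);
      s += src_stride;
      r += ref_stride;
    }
    res[j] = sad;
  }
}
#endif
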
void vp9_sad64x64x4d_avx2(uint8_t *src,
                          int src_stride,
                          uint8_t *ref[4],
                          int ref_stride,
                          unsigned int res[4]) {
  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
  __m256i ref3_reg, ref3next_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_setzero_si256();
  sum_ref1 = _mm256_setzero_si256();
  sum_ref2 = _mm256_setzero_si256();
  sum_ref3 = _mm256_setzero_si256();
  for (i = 0; i < 64; i++) {
    // load one 64-byte row from src and from each of the four refs
    src_reg = _mm256_load_si256((__m256i *)(src));
    srcnext_reg = _mm256_load_si256((__m256i *)(src + 32));
    ref0_reg = _mm256_loadu_si256((__m256i *)(ref0));
    ref0next_reg = _mm256_loadu_si256((__m256i *)(ref0 + 32));
    ref1_reg = _mm256_loadu_si256((__m256i *)(ref1));
    ref1next_reg = _mm256_loadu_si256((__m256i *)(ref1 + 32));
    ref2_reg = _mm256_loadu_si256((__m256i *)(ref2));
    ref2next_reg = _mm256_loadu_si256((__m256i *)(ref2 + 32));
    ref3_reg = _mm256_loadu_si256((__m256i *)(ref3));
    ref3next_reg = _mm256_loadu_si256((__m256i *)(ref3 + 32));
    // sum of absolute differences between each ref row and the src row
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);

    // accumulate the per-reference SADs
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;

    // within each 64-bit element of sum_ref-i the SAD sits in the low
    // 4 bytes and the high 4 bytes are zero.
    // shift sum_ref1 and sum_ref3 left by 4 bytes so they land on those
    // zero bytes (see the layout comment in vp9_sad32x32x4d_avx2 above)
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);

    // merge sum_ref0 with sum_ref1 and sum_ref2 with sum_ref3
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // interleave the 64-bit elements of sum_ref0 and sum_ref2 so that each
    // 128-bit lane holds one partial SAD per reference
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the two partial SADs of each reference within each lane
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128 bits to the high 128 bits to get the final four SADs
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extractf128_si256(sum_mlow, 1));

    _mm_storeu_si128((__m128i *)(res), sum);
  }
}
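
/* A usage sketch, compiled out: a motion-search caller typically evaluates
 * one source block against four candidate positions at once.  The names
 * example_4d_call, src_buf, ref_frame and the candidate offsets below are
 * hypothetical.  Note that src must be 32-byte aligned because the kernels
 * use _mm256_load_si256 for the source rows.
 */
#if 0
static void example_4d_call(uint8_t *src_buf, int src_stride,
                            uint8_t *ref_frame, int ref_stride) {
  unsigned int sads[4];
  uint8_t *refs[4];
  // four candidate positions, here simply one pixel apart horizontally
  refs[0] = ref_frame + 0;
  refs[1] = ref_frame + 1;
  refs[2] = ref_frame + 2;
  refs[3] = ref_frame + 3;
  vp9_sad32x32x4d_avx2(src_buf, src_stride, refs, ref_stride, sads);
  // sads[0..3] now hold the 32x32 SAD of src_buf against each candidate
}
#endif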